diff options
Diffstat (limited to 'kernel/bpf/helpers.c')
-rw-r--r-- | kernel/bpf/helpers.c | 439 |
1 files changed, 420 insertions, 19 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a6b04faed282..af30c6cbd65d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4,6 +4,7 @@ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/bpf-cgroup.h> +#include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> @@ -19,6 +20,7 @@ #include <linux/proc_ns.h> #include <linux/security.h> #include <linux/btf_ids.h> +#include <linux/bpf_mem_alloc.h> #include "../../lib/kstrtox.h" @@ -336,6 +338,7 @@ const struct bpf_func_proto bpf_spin_lock_proto = { .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, + .arg1_btf_id = BPF_PTR_POISON, }; static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) @@ -358,6 +361,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = { .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, + .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, @@ -366,9 +370,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, struct bpf_spin_lock *lock; if (lock_src) - lock = src + map->spin_lock_off; + lock = src + map->record->spin_lock_off; else - lock = dst + map->spin_lock_off; + lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); @@ -657,6 +661,7 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, @@ -687,6 +692,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, @@ -1169,7 +1175,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->timer_off; + t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); @@ -1398,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1408,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1432,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1477,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1489,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src if (err) return err; - memcpy(dst, src->data + src->offset + offset, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); return 0; } @@ -1500,12 +1510,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1517,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, if (err) return err; - memcpy(dst->data + dst->offset + offset, src, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst->data + dst->offset + offset, src, len); return 0; } @@ -1526,14 +1540,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1554,7 +1568,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -1663,6 +1677,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; +#endif default: break; } @@ -1700,20 +1720,401 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } -BTF_SET8_START(tracing_btf_ids) +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock) +{ + struct list_head *head = list_head, *orig_head = list_head; + + BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); + BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + + /* Do the actual list draining outside the lock to not hold the lock for + * too long, and also prevent deadlocks if tracing programs end up + * executing on entry/exit of functions called inside the critical + * section, and end up doing map ops that call bpf_list_head_free for + * the same map value again. + */ + __bpf_spin_lock_irqsave(spin_lock); + if (!head->next || list_empty(head)) + goto unlock; + head = head->next; +unlock: + INIT_LIST_HEAD(orig_head); + __bpf_spin_unlock_irqrestore(spin_lock); + + while (head != orig_head) { + void *obj = head; + + obj -= field->list_head.node_offset; + head = head->next; + /* The contained type can also have resources, including a + * bpf_list_head which needs to be freed. + */ + bpf_obj_free_fields(field->list_head.value_rec, obj); + /* bpf_mem_free requires migrate_disable(), since we can be + * called from map free path as well apart from BPF program (as + * part of map ops doing bpf_obj_free_fields). + */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, obj); + migrate_enable(); + } +} + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + u64 size = local_type_id__k; + void *p; + + p = bpf_mem_alloc(&bpf_global_ma, size); + if (!p) + return NULL; + if (meta) + bpf_obj_init(meta->field_offs, p); + return p; +} + +void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + void *p = p__alloc; + + if (meta) + bpf_obj_free_fields(meta->record, p); + bpf_mem_free(&bpf_global_ma, p); +} + +static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) +{ + struct list_head *n = (void *)node, *h = (void *)head; + + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + if (unlikely(!n->next)) + INIT_LIST_HEAD(n); + tail ? list_add_tail(n, h) : list_add(n, h); +} + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) +{ + return __bpf_list_add(node, head, false); +} + +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) +{ + return __bpf_list_add(node, head, true); +} + +static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) +{ + struct list_head *n, *h = (void *)head; + + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + if (list_empty(h)) + return NULL; + n = tail ? h->prev : h->next; + list_del_init(n); + return (struct bpf_list_node *)n; +} + +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) +{ + return __bpf_list_del(head, false); +} + +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) +{ + return __bpf_list_del(head, true); +} + +/** + * bpf_task_acquire - Acquire a reference to a task. A task acquired by this + * kfunc which is not stored in a map as a kptr, must be released by calling + * bpf_task_release(). + * @p: The task on which a reference is being acquired. + */ +struct task_struct *bpf_task_acquire(struct task_struct *p) +{ + return get_task_struct(p); +} + +/** + * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task + * acquired by this kfunc which is not stored in a map as a kptr, must be + * released by calling bpf_task_release(). + * @p: The task on which a reference is being acquired. + */ +struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) +{ + /* For the time being this function returns NULL, as it's not currently + * possible to safely acquire a reference to a task with RCU protection + * using get_task_struct() and put_task_struct(). This is due to the + * slightly odd mechanics of p->rcu_users, and how task RCU protection + * works. + * + * A struct task_struct is refcounted by two different refcount_t + * fields: + * + * 1. p->usage: The "true" refcount field which tracks a task's + * lifetime. The task is freed as soon as this + * refcount drops to 0. + * + * 2. p->rcu_users: An "RCU users" refcount field which is statically + * initialized to 2, and is co-located in a union with + * a struct rcu_head field (p->rcu). p->rcu_users + * essentially encapsulates a single p->usage + * refcount, and when p->rcu_users goes to 0, an RCU + * callback is scheduled on the struct rcu_head which + * decrements the p->usage refcount. + * + * There are two important implications to this task refcounting logic + * described above. The first is that + * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as + * after the refcount goes to 0, the RCU callback being scheduled will + * cause the memory backing the refcount to again be nonzero due to the + * fields sharing a union. The other is that we can't rely on RCU to + * guarantee that a task is valid in a BPF program. This is because a + * task could have already transitioned to being in the TASK_DEAD + * state, had its rcu_users refcount go to 0, and its rcu callback + * invoked in which it drops its single p->usage reference. At this + * point the task will be freed as soon as the last p->usage reference + * goes to 0, without waiting for another RCU gp to elapse. The only + * way that a BPF program can guarantee that a task is valid is in this + * scenario is to hold a p->usage refcount itself. + * + * Until we're able to resolve this issue, either by pulling + * p->rcu_users and p->rcu out of the union, or by getting rid of + * p->usage and just using p->rcu_users for refcounting, we'll just + * return NULL here. + */ + return NULL; +} + +/** + * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task + * kptr acquired by this kfunc which is not subsequently stored in a map, must + * be released by calling bpf_task_release(). + * @pp: A pointer to a task kptr on which a reference is being acquired. + */ +struct task_struct *bpf_task_kptr_get(struct task_struct **pp) +{ + /* We must return NULL here until we have clarity on how to properly + * leverage RCU for ensuring a task's lifetime. See the comment above + * in bpf_task_acquire_not_zero() for more details. + */ + return NULL; +} + +/** + * bpf_task_release - Release the reference acquired on a task. + * @p: The task on which a reference is being released. + */ +void bpf_task_release(struct task_struct *p) +{ + if (!p) + return; + + put_task_struct(p); +} + +#ifdef CONFIG_CGROUPS +/** + * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by + * this kfunc which is not stored in a map as a kptr, must be released by + * calling bpf_cgroup_release(). + * @cgrp: The cgroup on which a reference is being acquired. + */ +struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) +{ + cgroup_get(cgrp); + return cgrp; +} + +/** + * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup + * kptr acquired by this kfunc which is not subsequently stored in a map, must + * be released by calling bpf_cgroup_release(). + * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired. + */ +struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) +{ + struct cgroup *cgrp; + + rcu_read_lock(); + /* Another context could remove the cgroup from the map and release it + * at any time, including after we've done the lookup above. This is + * safe because we're in an RCU read region, so the cgroup is + * guaranteed to remain valid until at least the rcu_read_unlock() + * below. + */ + cgrp = READ_ONCE(*cgrpp); + + if (cgrp && !cgroup_tryget(cgrp)) + /* If the cgroup had been removed from the map and freed as + * described above, cgroup_tryget() will return false. The + * cgroup will be freed at some point after the current RCU gp + * has ended, so just return NULL to the user. + */ + cgrp = NULL; + rcu_read_unlock(); + + return cgrp; +} + +/** + * bpf_cgroup_release - Release the reference acquired on a cgroup. + * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to + * not be freed until the current grace period has ended, even if its refcount + * drops to 0. + * @cgrp: The cgroup on which a reference is being released. + */ +void bpf_cgroup_release(struct cgroup *cgrp) +{ + if (!cgrp) + return; + + cgroup_put(cgrp); +} + +/** + * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor + * array. A cgroup returned by this kfunc which is not subsequently stored in a + * map, must be released by calling bpf_cgroup_release(). + * @cgrp: The cgroup for which we're performing a lookup. + * @level: The level of ancestor to look up. + */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) +{ + struct cgroup *ancestor; + + if (level > cgrp->level || level < 0) + return NULL; + + ancestor = cgrp->ancestors[level]; + cgroup_get(ancestor); + return ancestor; +} +#endif /* CONFIG_CGROUPS */ + +/** + * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up + * in the root pid namespace idr. If a task is returned, it must either be + * stored in a map, or released with bpf_task_release(). + * @pid: The pid of the task being looked up. + */ +struct task_struct *bpf_task_from_pid(s32 pid) +{ + struct task_struct *p; + + rcu_read_lock(); + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (p) + bpf_task_acquire(p); + rcu_read_unlock(); + + return p; +} + +void *bpf_cast_to_kern_ctx(void *obj) +{ + return obj; +} + +void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) +{ + return obj__ign; +} + +void bpf_rcu_read_lock(void) +{ + rcu_read_lock(); +} + +void bpf_rcu_read_unlock(void) +{ + rcu_read_unlock(); +} + +__diag_pop(); + +BTF_SET8_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif -BTF_SET8_END(tracing_btf_ids) +BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_list_push_front) +BTF_ID_FLAGS(func, bpf_list_push_back) +BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) +#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +#endif +BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_SET8_END(generic_btf_ids) -static const struct btf_kfunc_id_set tracing_kfunc_set = { +static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, - .set = &tracing_btf_ids, + .set = &generic_btf_ids, +}; + + +BTF_ID_LIST(generic_dtor_ids) +BTF_ID(struct, task_struct) +BTF_ID(func, bpf_task_release) +#ifdef CONFIG_CGROUPS +BTF_ID(struct, cgroup) +BTF_ID(func, bpf_cgroup_release) +#endif + +BTF_SET8_START(common_btf_ids) +BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) +BTF_ID_FLAGS(func, bpf_rdonly_cast) +BTF_ID_FLAGS(func, bpf_rcu_read_lock) +BTF_ID_FLAGS(func, bpf_rcu_read_unlock) +BTF_SET8_END(common_btf_ids) + +static const struct btf_kfunc_id_set common_kfunc_set = { + .owner = THIS_MODULE, + .set = &common_btf_ids, }; static int __init kfunc_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set); + int ret; + const struct btf_id_dtor_kfunc generic_dtors[] = { + { + .btf_id = generic_dtor_ids[0], + .kfunc_btf_id = generic_dtor_ids[1] + }, +#ifdef CONFIG_CGROUPS + { + .btf_id = generic_dtor_ids[2], + .kfunc_btf_id = generic_dtor_ids[3] + }, +#endif + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, + ARRAY_SIZE(generic_dtors), + THIS_MODULE); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); |