diff options
-rw-r--r-- | include/linux/bpf_types.h | 1 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 1 | ||||
-rw-r--r-- | kernel/bpf/ringbuf.c | 62 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 3 | ||||
-rw-r--r-- | tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 | ||||
-rw-r--r-- | tools/bpf/bpftool/map.c | 2 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 1 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.c | 1 |
8 files changed, 65 insertions, 8 deletions
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index b483aea35f41..754e915748fb 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,10 +38,27 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, + .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, @@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_btf_id = &ringbuf_map_btf_ids[0], }; +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], +}; + /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. This page offset is stored as part of record metadata and allows to diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c6fbcd0afaf..83710b60e708 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; + case BPF_MAP_TYPE_USER_RINGBUF: + goto error; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 7c188a598444..7f3b67a8b48f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 38b6bc9c26c3..9a6ca9f31133 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter }\n" + " task_storage | bloom_filter | user_ringbuf }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2ca30ccc774c..d480da05b6de 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,6 +163,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", }; static const char * const prog_type_name[] = { |