From de920fc64cbaa031f947e9be964bda05fd090380 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 8 Mar 2021 17:56:47 -0800 Subject: bpf, x86: Use kvmalloc_array instead kmalloc_array in bpf_jit_comp x86 bpf_jit_comp.c used kmalloc_array to store jited addresses for each bpf insn. With a large bpf program, we have see the following allocation failures in our production server: page allocation failure: order:5, mode:0x40cc0(GFP_KERNEL|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0" Call Trace: dump_stack+0x50/0x70 warn_alloc.cold.120+0x72/0xd2 ? __alloc_pages_direct_compact+0x157/0x160 __alloc_pages_slowpath+0xcdb/0xd00 ? get_page_from_freelist+0xe44/0x1600 ? vunmap_page_range+0x1ba/0x340 __alloc_pages_nodemask+0x2c9/0x320 kmalloc_order+0x18/0x80 kmalloc_order_trace+0x1d/0xa0 bpf_int_jit_compile+0x1e2/0x484 ? kmalloc_order_trace+0x1d/0xa0 bpf_prog_select_runtime+0xc3/0x150 bpf_prog_load+0x480/0x720 ? __mod_memcg_lruvec_state+0x21/0x100 __do_sys_bpf+0xc31/0x2040 ? close_pdeo+0x86/0xe0 do_syscall_64+0x42/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f2f300f7fa9 Code: Bad RIP value. Dumped assembly: ffffffff810b6d70 : ; { ffffffff810b6d70: e8 eb a5 b4 00 callq 0xffffffff81c01360 <__fentry__> ffffffff810b6d75: 41 57 pushq %r15 ... ffffffff810b6f39: e9 72 fe ff ff jmp 0xffffffff810b6db0 ; addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); ffffffff810b6f3e: 8b 45 0c movl 12(%rbp), %eax ; return __kmalloc(bytes, flags); ffffffff810b6f41: be c0 0c 00 00 movl $3264, %esi ; addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); ffffffff810b6f46: 8d 78 01 leal 1(%rax), %edi ; if (unlikely(check_mul_overflow(n, size, &bytes))) ffffffff810b6f49: 48 c1 e7 02 shlq $2, %rdi ; return __kmalloc(bytes, flags); ffffffff810b6f4d: e8 8e 0c 1d 00 callq 0xffffffff81287be0 <__kmalloc> ; if (!addrs) { ffffffff810b6f52: 48 85 c0 testq %rax, %rax Change kmalloc_array() to kvmalloc_array() to avoid potential allocation error for big bpf programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210309015647.3657852-1-yhs@fb.com --- arch/x86/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6926d0ca6c71..747bba0a584a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2225,7 +2225,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2317,7 +2317,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } -- cgit From e21aa341785c679dd409c8cb71f864c00fe6c463 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Mar 2021 14:00:07 -0700 Subject: bpf: Fix fexit trampoline. The fexit/fmod_ret programs can be attached to kernel functions that can sleep. The synchronize_rcu_tasks() will not wait for such tasks to complete. In such case the trampoline image will be freed and when the task wakes up the return IP will point to freed memory causing the crash. Solve this by adding percpu_ref_get/put for the duration of trampoline and separate trampoline vs its image life times. The "half page" optimization has to be removed, since first_half->second_half->first_half transition cannot be guaranteed to complete in deterministic time. Every trampoline update becomes a new image. The image with fmod_ret or fexit progs will be freed via percpu_ref_kill and call_rcu_tasks. Together they will wait for the original function and trampoline asm to complete. The trampoline is patched from nop to jmp to skip fexit progs. They are freed independently from the trampoline. The image with fentry progs only will be freed via call_rcu_tasks_trace+call_rcu_tasks which will wait for both sleepable and non-sleepable progs to complete. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney # for RCU Link: https://lore.kernel.org/bpf/20210316210007.38949-1-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 747bba0a584a..72b5a57e9e31 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2011,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + emit_nops(&prog, 5); } if (fmod_ret->nr_progs) { @@ -2033,9 +2043,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ -- cgit From b9082970478009b778aa9b22d5561eef35b53b63 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 19 Mar 2021 17:00:01 -0700 Subject: bpf: Use NOP_ATOMIC5 instead of emit_nops(&prog, 5) for BPF_TRAMP_F_CALL_ORIG __bpf_arch_text_poke does rewrite only for atomic nop5, emit_nops(xxx, 5) emits non-atomic one which breaks fentry/fexit with k8 atomics: P6_NOP5 == P6_NOP5_ATOMIC (0f1f440000 == 0f1f440000) K8_NOP5 != K8_NOP5_ATOMIC (6666906690 != 6666666690) Can be reproduced by doing "ideal_nops = k8_nops" in "arch_init_ideal_nops() and running fexit_bpf2bpf selftest. Fixes: e21aa341785c ("bpf: Fix fexit trampoline.") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210320000001.915366-1-sdf@google.com --- arch/x86/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 72b5a57e9e31..b35fc8023884 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2012,7 +2012,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); im->ip_after_call = prog; - emit_nops(&prog, 5); + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { -- cgit