From 9f2470fbc4cb4583c080bb729a998933ba61aca4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:35 -0700 Subject: skmsg: Improve udp_bpf_recvmsg() accuracy I tried to reuse sk_msg_wait_data() for different protocols, but it turns out it can not be simply reused. For example, UDP actually uses two queues to receive skb: udp_sk(sk)->reader_queue and sk->sk_receive_queue. So we have to check both of them to know whether we have received any packet. Also, UDP does not lock the sock during BH Rx path, it makes no sense for its ->recvmsg() to lock the sock. It is always possible for ->recvmsg() to be called before packets actually arrive in the receive queue, we just use best effort to make it accurate here. Fixes: 1f5be6b3b063 ("udp: Implement udp_bpf_recvmsg() for sockmap") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-2-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 43ce17a6a585..f9a81b314e4c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,29 +399,6 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} -EXPORT_SYMBOL_GPL(sk_msg_wait_data); - /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) -- cgit From 30b9c54a707db4155735cf71f4600241c1b7b6ff Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:38 -0700 Subject: skmsg: Clear skb redirect pointer before dropping it When we drop skb inside sk_psock_skb_redirect(), we have to clear its skb->_sk_redir pointer too, otherwise kfree_skb() would misinterpret it as a valid skb->_skb_refdst and dst_release() would eventually complain. Fixes: e3526bb92a20 ("skmsg: Move sk_redir from TCP_SKB_CB to skb") Reported-by: Jiang Wang Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-5-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f9a81b314e4c..4334720e2a04 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -843,12 +843,14 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * a socket that is in this state so we drop the skb. */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + skb_bpf_redirect_clear(skb); kfree_skb(skb); return; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); + skb_bpf_redirect_clear(skb); kfree_skb(skb); return; } -- cgit From 0cf6672b23c8aa9d9274798dd63cbf6ede77ef90 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:39 -0700 Subject: skmsg: Fix a memory leak in sk_psock_verdict_apply() If the dest psock does not set SK_PSOCK_TX_ENABLED, the skb can't be queued anywhere so must be dropped. This one is found during code review. Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-6-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4334720e2a04..5464477e2d3d 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -924,8 +924,13 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); schedule_work(&psock->work); + err = 0; } spin_unlock_bh(&psock->ingress_lock); + if (err < 0) { + skb_bpf_redirect_clear(skb); + goto out_free; + } } break; case __SK_REDIRECT: -- cgit From 1581a6c1c3291a8320b080f4411345f60229976d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:40 -0700 Subject: skmsg: Teach sk_psock_verdict_apply() to return errors Currently sk_psock_verdict_apply() is void, but it handles some error conditions too. Its caller is impossible to learn whether it succeeds or fails, especially sk_psock_verdict_recv(). Make it return int to indicate error cases and propagate errors to callers properly. Fixes: ef5659280eb1 ("bpf, sockmap: Allow skipping sk_skb parser program") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-7-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 5464477e2d3d..e3d210811db4 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -824,7 +824,7 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static void sk_psock_skb_redirect(struct sk_buff *skb) +static int sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -835,7 +835,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) */ if (unlikely(!sk_other)) { kfree_skb(skb); - return; + return -EIO; } psock_other = sk_psock(sk_other); /* This error indicates the socket is being torn down or had another @@ -845,19 +845,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); kfree_skb(skb); - return; + return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); kfree_skb(skb); - return; + return -EIO; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); spin_unlock_bh(&psock_other->ingress_lock); + return 0; } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -894,14 +895,15 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, + int verdict) { struct sock *sk_other; - int err = -EIO; + int err = 0; switch (verdict) { case __SK_PASS: + err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { @@ -934,13 +936,15 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } break; case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + err = sk_psock_skb_redirect(skb); break; case __SK_DROP: default: out_free: kfree_skb(skb); } + + return err; } static void sk_psock_write_space(struct sock *sk) @@ -1107,7 +1111,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_verdict_apply(psock, skb, ret); + if (sk_psock_verdict_apply(psock, skb, ret) < 0) + len = 0; out: rcu_read_unlock(); return len; -- cgit From 42830571f1fd9751b3fbf38084bbb253320e185f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:41 -0700 Subject: skmsg: Pass source psock to sk_psock_skb_redirect() sk_psock_skb_redirect() only takes skb as a parameter, we will need to know where this skb is from, so just pass the source psock to this function as a new parameter. This patch prepares for the next one. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-8-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e3d210811db4..3aa9065811ad 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -824,7 +824,7 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_skb_redirect(struct sk_buff *skb) +static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -861,11 +861,12 @@ static int sk_psock_skb_redirect(struct sk_buff *skb) return 0; } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, + struct sk_psock *from, int verdict) { switch (verdict) { case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + sk_psock_skb_redirect(from, skb); break; case __SK_PASS: case __SK_DROP: @@ -889,7 +890,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, psock->sk, ret); + sk_psock_tls_verdict_apply(skb, psock, ret); rcu_read_unlock(); return ret; } @@ -936,7 +937,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, } break; case __SK_REDIRECT: - err = sk_psock_skb_redirect(skb); + err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: -- cgit From 781dd0431eb549f9cb1fdddf91a50d985febe884 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:42 -0700 Subject: skmsg: Increase sk->sk_drops when dropping packets It is hard to observe packet drops without increasing relevant drop counters, here we should increase sk->sk_drops which is a protocol-independent counter. Fortunately psock is always associated with a struct sock, we can just use psock->sk. Suggested-by: John Fastabend Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-9-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3aa9065811ad..9b6160a191f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -578,6 +578,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return sk_psock_skb_ingress(psock, skb); } +static void sock_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + kfree_skb(skb); +} + static void sk_psock_backlog(struct work_struct *work) { struct sk_psock *psock = container_of(work, struct sk_psock, work); @@ -617,7 +623,7 @@ start: /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - kfree_skb(skb); + sock_drop(psock->sk, skb); goto end; } off += ret; @@ -708,7 +714,7 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(psock->sk, skb); } __sk_psock_purge_ingress_msg(psock); } @@ -834,7 +840,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } psock_other = sk_psock(sk_other); @@ -844,14 +850,14 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } @@ -942,7 +948,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, case __SK_DROP: default: out_free: - kfree_skb(skb); + sock_drop(psock->sk, skb); } return err; @@ -977,7 +983,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) sk = strp->sk; psock = sk_psock(sk); if (unlikely(!psock)) { - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); @@ -1098,7 +1104,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); -- cgit From 7dd5d437c258bbf4cc15b35229e5208b87b8b4e0 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 13 Jun 2021 21:34:39 +0700 Subject: bpf: Fix integer overflow in argument calculation for bpf_map_area_alloc In 32-bit architecture, the result of sizeof() is a 32-bit integer so the expression becomes the multiplication between 2 32-bit integer which can potentially leads to integer overflow. As a result, bpf_map_area_alloc() allocates less memory than needed. Fix this by casting 1 operand to u64. Fixes: 0d2c4f964050 ("bpf: Eliminate rlimit-based memory accounting for sockmap and sockhash maps") Fixes: 99c51064fb06 ("devmap: Use bpf_map_area_alloc() for allocating hash buckets") Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Bui Quang Minh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210613143440.71975-1-minhquangbui99@gmail.com --- net/core/sock_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 6f1b82b8ad49..60decd6420ca 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - stab->sks = bpf_map_area_alloc(stab->map.max_entries * + stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { -- cgit