From 4873a1b2024dce9a501b56d1039ff0752027a92e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 31 Jul 2022 15:41:06 +0300 Subject: net/sched: remove hacks added to dev_trans_start() for bonding to work Now that the bonding driver keeps track of the last TX time of ARP and NS probes, we effectively revert the following commits: 32d3e51a82d4 ("net_sched: use macvlan real dev trans_start in dev_trans_start()") 07ce76aa9bcf ("net_sched: make dev_trans_start return vlan's real dev trans_start") Note that the approach of continuing to hack at this function would not get us very far, hence the desire to take a different approach. DSA is also a virtual device that uses NETIF_F_LLTX, but there, many uppers share the same lower (DSA master, i.e. the physical host port of a switch). By making dev_trans_start() on a DSA interface return the dev_trans_start() of the master, we effectively assume that all other DSA interfaces are silent, otherwise this corrupts the validity of the probe timestamp data from the bonding driver's perspective. Furthermore, the hacks didn't take into consideration the fact that the lower interface of @dev may not have been physical either. For example, VLAN over VLAN, or DSA with 2 masters in a LAG. And even furthermore, there are NETIF_F_LLTX devices which are not stacked, like veth. The hack here would not work with those, because it would not have to provide the bonding driver something to chew at all. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_generic.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cc6eabee2830..d47b9689eba6 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -427,14 +427,10 @@ void __qdisc_run(struct Qdisc *q) unsigned long dev_trans_start(struct net_device *dev) { - unsigned long val, res; + unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); + unsigned long val; unsigned int i; - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - else if (netif_is_macvlan(dev)) - dev = macvlan_dev_real_dev(dev); - res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); for (i = 1; i < dev->num_tx_queues; i++) { val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) -- cgit From c0bf3c6aa444a5ef44acc57ef6cfa53fd4fc1c9b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Aug 2022 17:21:25 -0700 Subject: mptcp: move subflow cleanup in mptcp_destroy_common() If the mptcp socket creation fails due to a CGROUP_INET_SOCK_CREATE eBPF program, the MPTCP protocol ends-up leaking all the subflows: the related cleanup happens in __mptcp_destroy_sock() that is not invoked in such code path. Address the issue moving the subflow sockets cleanup in the mptcp_destroy_common() helper, which is invoked in every msk cleanup path. Additionally get rid of the intermediate list_splice_init step, which is an unneeded relic from the past. The issue is present since before the reported root cause commit, but any attempt to backport the fix before that hash will require a complete rewrite. Fixes: e16163b6e2 ("mptcp: refactor shutdown and close") Reported-by: Nguyen Dinh Phi Reviewed-by: Mat Martineau Co-developed-by: Nguyen Dinh Phi Signed-off-by: Nguyen Dinh Phi Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 39 +++++++++++++++------------------------ net/mptcp/protocol.h | 2 +- net/mptcp/subflow.c | 3 ++- 3 files changed, 18 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a3f1c1461874..07fcc86e1fc9 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2769,30 +2769,16 @@ static void __mptcp_wr_shutdown(struct sock *sk) static void __mptcp_destroy_sock(struct sock *sk) { - struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); - LIST_HEAD(conn_list); pr_debug("msk=%p", msk); might_sleep(); - /* join list will be eventually flushed (with rst) at sock lock release time*/ - list_splice_init(&msk->conn_list, &conn_list); - mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; - /* clears msk->subflow, allowing the following loop to close - * even the initial subflow - */ - mptcp_dispose_initial_subflow(msk); - list_for_each_entry_safe(subflow, tmp, &conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - __mptcp_close_ssk(sk, ssk, subflow, 0); - } - sk->sk_prot->destroy(sk); WARN_ON_ONCE(msk->rmem_fwd_alloc); @@ -2884,24 +2870,20 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) static int mptcp_disconnect(struct sock *sk, int flags) { - struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); inet_sk_state_store(sk, TCP_CLOSE); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE); - } - mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); if (mptcp_sk(sk)->token) mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); - mptcp_destroy_common(msk); + /* msk->subflow is still intact, the following will not free the first + * subflow + */ + mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); msk->last_snd = NULL; WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; @@ -3051,12 +3033,17 @@ out: return newsk; } -void mptcp_destroy_common(struct mptcp_sock *msk) +void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { + struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); + /* join list will be eventually flushed (with rst) at sock lock release time */ + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ mptcp_data_lock(sk); skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); @@ -3078,7 +3065,11 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_destroy_common(msk); + /* clears msk->subflow, allowing the following to close + * even the initial subflow + */ + mptcp_dispose_initial_subflow(msk); + mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5d6043c16b09..40881a7df5d5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -717,7 +717,7 @@ static inline void mptcp_write_space(struct sock *sk) } } -void mptcp_destroy_common(struct mptcp_sock *msk); +void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); #define MPTCP_TOKEN_MAX_RETRIES 4 diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 901c763dcdbb..c7d49fb6e7bd 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -621,7 +621,8 @@ static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } - mptcp_destroy_common(mptcp_sk(sk)); + /* We don't need to clear msk->subflow, as it's still NULL at this point */ + mptcp_destroy_common(mptcp_sk(sk), 0); inet_sock_destruct(sk); } -- cgit From c886d70286bf3ad411eb3d689328a67f7102c6ae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Aug 2022 17:21:26 -0700 Subject: mptcp: do not queue data on closed subflows Dipanjan reported a syzbot splat at close time: WARNING: CPU: 1 PID: 10818 at net/ipv4/af_inet.c:153 inet_sock_destruct+0x6d0/0x8e0 net/ipv4/af_inet.c:153 Modules linked in: uio_ivshmem(OE) uio(E) CPU: 1 PID: 10818 Comm: kworker/1:16 Tainted: G OE 5.19.0-rc6-g2eae0556bb9d #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: events mptcp_worker RIP: 0010:inet_sock_destruct+0x6d0/0x8e0 net/ipv4/af_inet.c:153 Code: 21 02 00 00 41 8b 9c 24 28 02 00 00 e9 07 ff ff ff e8 34 4d 91 f9 89 ee 4c 89 e7 e8 4a 47 60 ff e9 a6 fc ff ff e8 20 4d 91 f9 <0f> 0b e9 84 fe ff ff e8 14 4d 91 f9 0f 0b e9 d4 fd ff ff e8 08 4d RSP: 0018:ffffc9001b35fa78 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000002879d0 RCX: ffff8881326f3b00 RDX: 0000000000000000 RSI: ffff8881326f3b00 RDI: 0000000000000002 RBP: ffff888179662674 R08: ffffffff87e983a0 R09: 0000000000000000 R10: 0000000000000005 R11: 00000000000004ea R12: ffff888179662400 R13: ffff888179662428 R14: 0000000000000001 R15: ffff88817e38e258 FS: 0000000000000000(0000) GS:ffff8881f5f00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020007bc0 CR3: 0000000179592000 CR4: 0000000000150ee0 Call Trace: __sk_destruct+0x4f/0x8e0 net/core/sock.c:2067 sk_destruct+0xbd/0xe0 net/core/sock.c:2112 __sk_free+0xef/0x3d0 net/core/sock.c:2123 sk_free+0x78/0xa0 net/core/sock.c:2134 sock_put include/net/sock.h:1927 [inline] __mptcp_close_ssk+0x50f/0x780 net/mptcp/protocol.c:2351 __mptcp_destroy_sock+0x332/0x760 net/mptcp/protocol.c:2828 mptcp_worker+0x5d2/0xc90 net/mptcp/protocol.c:2586 process_one_work+0x9cc/0x1650 kernel/workqueue.c:2289 worker_thread+0x623/0x1070 kernel/workqueue.c:2436 kthread+0x2e9/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:302 The root cause of the problem is that an mptcp-level (re)transmit can race with mptcp_close() and the packet scheduler checks the subflow state before acquiring the socket lock: we can try to (re)transmit on an already closed ssk. Fix the issue checking again the subflow socket status under the subflow socket lock protection. Additionally add the missing check for the fallback-to-tcp case. Fixes: d5f49190def6 ("mptcp: allow picking different xmit subflows") Reported-by: Dipanjan Das Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 8 +++++++- net/mptcp/protocol.h | 11 +++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 07fcc86e1fc9..da4257504fad 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1240,6 +1240,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, info->limit > dfrag->data_len)) return 0; + if (unlikely(!__tcp_can_send(ssk))) + return -EAGAIN; + /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; @@ -1413,7 +1416,8 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; - return sk_stream_memory_free(msk->first) ? msk->first : NULL; + return __tcp_can_send(msk->first) && + sk_stream_memory_free(msk->first) ? msk->first : NULL; } /* re-use last subflow, if the burst allow that */ @@ -1564,6 +1568,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { + if (ret == -EAGAIN) + continue; mptcp_push_release(ssk, &info); goto out; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 40881a7df5d5..132d50833df1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -624,16 +624,19 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); -static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) +static inline bool __tcp_can_send(const struct sock *ssk) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + /* only send if our side has not closed yet */ + return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} +static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ if (subflow->request_join && !subflow->fully_established) return false; - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); + return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); } void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); -- cgit From 399a14ec7993d605740de7b2cd5c0ce8407d12ed Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 Aug 2022 19:26:27 +0200 Subject: netfilter: nf_tables: fix crash when nf_trace is enabled do not access info->pkt when info->trace is not 1. nft_traceinfo is not initialized, except when tracing is enabled. The 'nft_trace_enabled' static key cannot be used for this, we must always check info->trace first. Pass nft_pktinfo directly to avoid this. Fixes: e34b9ed96ce3 ("netfilter: nf_tables: avoid skb access on nf_stolen") Reported-by: Hangbin Liu Signed-off-by: Florian Westphal Signed-off-by: Jakub Kicinski --- net/netfilter/nf_tables_core.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 3ddce24ac76d..cee3e4e905ec 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -34,25 +34,23 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info, nft_trace_notify(info); } -static inline void nft_trace_packet(struct nft_traceinfo *info, +static inline void nft_trace_packet(const struct nft_pktinfo *pkt, + struct nft_traceinfo *info, const struct nft_chain *chain, const struct nft_rule_dp *rule, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { - const struct nft_pktinfo *pkt = info->pkt; - info->nf_trace = pkt->skb->nf_trace; info->rule = rule; __nft_trace_packet(info, chain, type); } } -static inline void nft_trace_copy_nftrace(struct nft_traceinfo *info) +static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt, + struct nft_traceinfo *info) { if (static_branch_unlikely(&nft_trace_enabled)) { - const struct nft_pktinfo *pkt = info->pkt; - if (info->trace) info->nf_trace = pkt->skb->nf_trace; } @@ -96,7 +94,6 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, const struct nft_chain *chain, const struct nft_regs *regs) { - const struct nft_pktinfo *pkt = info->pkt; enum nft_trace_types type; switch (regs->verdict.code) { @@ -110,7 +107,9 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, break; default: type = NFT_TRACETYPE_RULE; - info->nf_trace = pkt->skb->nf_trace; + + if (info->trace) + info->nf_trace = info->pkt->skb->nf_trace; break; } @@ -271,10 +270,10 @@ next_rule: switch (regs.verdict.code) { case NFT_BREAK: regs.verdict.code = NFT_CONTINUE; - nft_trace_copy_nftrace(&info); + nft_trace_copy_nftrace(pkt, &info); continue; case NFT_CONTINUE: - nft_trace_packet(&info, chain, rule, + nft_trace_packet(pkt, &info, chain, rule, NFT_TRACETYPE_RULE); continue; } @@ -318,7 +317,7 @@ next_rule: goto next_rule; } - nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY); + nft_trace_packet(pkt, &info, basechain, NULL, NFT_TRACETYPE_POLICY); if (static_branch_unlikely(&nft_counters_enabled)) nft_update_chain_stats(basechain, pkt); -- cgit From b06ada6df9cf785099c142d96cb8a337ff46adf7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Aug 2022 19:26:29 +0200 Subject: netfilter: flowtable: fix incorrect Kconfig dependencies Remove default to 'y', this infrastructure is not fundamental for the flowtable operational. Add a missing dependency on CONFIG_NF_FLOW_TABLE. Reported-by: Linus Torvalds Fixes: b038177636f8 ("netfilter: nf_flow_table: count pending offload workqueue tasks") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Jakub Kicinski --- net/netfilter/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index df6abbfe0079..22f15ebf6045 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -736,9 +736,8 @@ config NF_FLOW_TABLE config NF_FLOW_TABLE_PROCFS bool "Supply flow table statistics in procfs" - default y + depends on NF_FLOW_TABLE depends on PROC_FS - depends on SYSCTL help This option enables for the flow table offload statistics to be shown in procfs under net/netfilter/nf_flowtable. -- cgit From df1c941468fca014ad092f76672966bb412c2848 Mon Sep 17 00:00:00 2001 From: Francois Romieu Date: Tue, 2 Aug 2022 17:07:42 +0200 Subject: net: avoid overflow when rose /proc displays timer information. rose /proc code does not serialize timer accesses. Initial report by Bernard F6BVP Pidoux exhibits overflow amounting to 116 ticks on its HZ=250 system. Full timer access serialization would imho be overkill as rose /proc does not enforce consistency between displayed ROSE_STATE_XYZ and timer values during changes of state. The patch may also fix similar behavior in ax25 /proc, ax25 ioctl and netrom /proc as they all exhibit the same timer serialization policy. This point has not been reported though. The sole remaining use of ax25_display_timer - ax25 rtt valuation - may also perform marginally better but I have not analyzed it too deeply. Cc: Thomas DL9SAU Osterried Link: https://lore.kernel.org/all/d5e93cc7-a91f-13d3-49a1-b50c11f0f811@free.fr/ Signed-off-by: Francois Romieu Tested-by: Bernard Pidoux Link: https://lore.kernel.org/r/Yuk9vq7t7VhmnOXu@electric-eye.fr.zoreil.com Signed-off-by: Jakub Kicinski --- net/ax25/ax25_timer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c index 85865ebfdfa2..9f7cb0a7c73f 100644 --- a/net/ax25/ax25_timer.c +++ b/net/ax25/ax25_timer.c @@ -108,10 +108,12 @@ int ax25_t1timer_running(ax25_cb *ax25) unsigned long ax25_display_timer(struct timer_list *timer) { + long delta = timer->expires - jiffies; + if (!timer_pending(timer)) return 0; - return timer->expires - jiffies; + return max(0L, delta); } EXPORT_SYMBOL(ax25_display_timer); -- cgit From ac0dbed9ba4c38ed9b5fd3a43ee4bc1f48901a34 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 2 Aug 2022 09:12:03 -0700 Subject: net: seg6: initialize induction variable to first valid array index Fixes the following warnings observed when building CONFIG_IPV6_SEG6_LWTUNNEL=y with clang: net/ipv6/seg6_local.o: warning: objtool: seg6_local_fill_encap() falls through to next function seg6_local_get_encap_size() net/ipv6/seg6_local.o: warning: objtool: seg6_local_cmp_encap() falls through to next function input_action_end() LLVM can fully unroll loops in seg6_local_get_encap_size() and seg6_local_cmp_encap(). One issue in those loops is that the induction variable is initialized to 0. The loop iterates over members of seg6_action_params, a global array of struct seg6_action_param calling their put() function pointer members. seg6_action_param uses an array initializer to initialize SEG6_LOCAL_SRH and later elements, which is the third enumeration of an anonymous union. The guard `if (attrs & SEG6_F_ATTR(i))` may prevent this from being called at runtime, but it would still be UB for `seg6_action_params[0]->put` to be called; the unrolled loop will make the initial iterations unreachable, which LLVM will later rotate to fallthrough to the next function. Make this more obvious that this cannot happen to the compiler by initializing the loop induction variable to the minimum valid index that seg6_action_params is initialized to. Reported-by: Thomas Gleixner Signed-off-by: Nick Desaulniers Link: https://lore.kernel.org/r/20220802161203.622293-1-ndesaulniers@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_local.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 2cd4a8d3b30a..b7de5e46fdd8 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1614,7 +1614,7 @@ static void __destroy_attrs(unsigned long parsed_attrs, int max_parsed, * callback. If the callback is not available, then we skip to the next * attribute; otherwise, we call the destroy() callback. */ - for (i = 0; i < max_parsed; ++i) { + for (i = SEG6_LOCAL_SRH; i < max_parsed; ++i) { if (!(parsed_attrs & SEG6_F_ATTR(i))) continue; @@ -1643,7 +1643,7 @@ static int parse_nla_optional_attrs(struct nlattr **attrs, struct seg6_action_param *param; int err, i; - for (i = 0; i < SEG6_LOCAL_MAX + 1; ++i) { + for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; ++i) { if (!(desc->optattrs & SEG6_F_ATTR(i)) || !attrs[i]) continue; @@ -1742,7 +1742,7 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) } /* parse the required attributes */ - for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) { if (desc->attrs & SEG6_F_ATTR(i)) { if (!attrs[i]) return -EINVAL; @@ -1847,7 +1847,7 @@ static int seg6_local_fill_encap(struct sk_buff *skb, attrs = slwt->desc->attrs | slwt->parsed_optattrs; - for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) { if (attrs & SEG6_F_ATTR(i)) { param = &seg6_action_params[i]; err = param->put(skb, slwt); @@ -1927,7 +1927,7 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a, if (attrs_a != attrs_b) return 1; - for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) { if (attrs_a & SEG6_F_ATTR(i)) { param = &seg6_action_params[i]; if (param->cmp(slwt_a, slwt_b)) -- cgit From baa56dfe2cdad12edb2625b2d454e205943c3402 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Fri, 5 Aug 2022 19:22:59 +0530 Subject: wifi: cfg80211: Fix validating BSS pointers in __cfg80211_connect_result Driver's SME is allowed to fill either BSSID or BSS pointers in struct cfg80211_connect_resp_params when indicating connect response but a check in __cfg80211_connect_result() is giving unnecessary warning when driver's SME fills only BSSID pointer and not BSS pointer in struct cfg80211_connect_resp_params. In case of mac80211 with auth/assoc path, it is always expected to fill BSS pointers in struct cfg80211_connect_resp_params when calling __cfg80211_connect_result() since cfg80211 must have hold BSS pointers in cfg80211_mlme_assoc(). So, skip the check for the drivers which support cfg80211 connect callback, for example with brcmfmac is one such driver which had the warning: WARNING: CPU: 5 PID: 514 at net/wireless/sme.c:786 __cfg80211_connect_result+0x2fc/0x5c0 [cfg80211] Reported-by: Linus Torvalds Fixes: efbabc116500 ("cfg80211: Indicate MLO connection info in connect and roam callbacks") Signed-off-by: Veerendranath Jakkam [kvalo@kernel.org: add more info to the commit log] Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220805135259.4126630-1-quic_vjakkam@quicinc.com --- net/wireless/sme.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 62c773cf1b8d..27fb2a0c4052 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -782,9 +782,11 @@ void __cfg80211_connect_result(struct net_device *dev, #endif if (cr->status == WLAN_STATUS_SUCCESS) { - for_each_valid_link(cr, link) { - if (WARN_ON_ONCE(!cr->links[link].bss)) - break; + if (!wiphy_to_rdev(wdev->wiphy)->ops->connect) { + for_each_valid_link(cr, link) { + if (WARN_ON_ONCE(!cr->links[link].bss)) + break; + } } for_each_valid_link(cr, link) { -- cgit From f574f7f839fc1753467b52417591cf2668825a92 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 4 Aug 2022 23:04:21 +0800 Subject: net: bpf: Use the protocol's set_rcvlowat behavior if there is one The commit d1361840f8c5 ("tcp: fix SO_RCVLOWAT and RCVBUF autotuning") add one new (struct proto_ops)->set_rcvlowat method so that a protocol can override the default setsockopt(SO_RCVLOWAT) behavior. The prior bpf codes don't check and invoke the protos's set_rcvlowat, now correct it. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- net/core/filter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5669248aff25..e8508aaafd27 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5063,7 +5063,10 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); + if (sk->sk_socket && sk->sk_socket->ops->set_rcvlowat) + ret = sk->sk_socket->ops->set_rcvlowat(sk, val); + else + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_MARK: if (sk->sk_mark != val) { -- cgit From 332f1795ca202489c665a75e62e18ff6284de077 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 1 Aug 2022 13:52:07 -0700 Subject: Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression The patch d0be8347c623: "Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put" from Jul 21, 2022, leads to the following Smatch static checker warning: net/bluetooth/l2cap_core.c:1977 l2cap_global_chan_by_psm() error: we previously assumed 'c' could be null (see line 1996) Fixes: d0be8347c623 ("Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put") Reported-by: Dan Carpenter Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 77c0aac14539..cbe0cae73434 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1970,11 +1970,11 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr_t *dst, u8 link_type) { - struct l2cap_chan *c, *c1 = NULL; + struct l2cap_chan *c, *tmp, *c1 = NULL; read_lock(&chan_list_lock); - list_for_each_entry(c, &chan_list, global_l) { + list_for_each_entry_safe(c, tmp, &chan_list, global_l) { if (state && c->state != state) continue; @@ -1993,11 +1993,10 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { c = l2cap_chan_hold_unless_zero(c); - if (!c) - continue; - - read_unlock(&chan_list_lock); - return c; + if (c) { + read_unlock(&chan_list_lock); + return c; + } } /* Closest match */ -- cgit From 164dac9755ac297b0c07505ad3db9e7d69b80499 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 27 Jul 2022 15:08:56 +0300 Subject: Bluetooth: ISO: unlock on error path in iso_sock_setsockopt() Call release_sock(sk); before returning on this error path. Fixes: ccf74f2390d60 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Dan Carpenter Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ff09c353e64e..19d003727b50 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1177,8 +1177,10 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, } len = min_t(unsigned int, sizeof(qos), optlen); - if (len != sizeof(qos)) - return -EINVAL; + if (len != sizeof(qos)) { + err = -EINVAL; + break; + } memset(&qos, 0, sizeof(qos)); -- cgit From 10b9adb556508a299dc283b7c746b811f6918987 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 28 Jul 2022 13:56:36 -0700 Subject: Bluetooth: hci_conn: Fix updating ISO QoS PHY BT_ISO_QOS has different semantics when it comes to QoS PHY as it uses 0x00 to disable a direction but that value is invalid over HCI and sockets using DEFER_SETUP to connect may attempt to use hci_bind_cis multiple times in order to detect if the parameters have changed, so to fix the code will now just mirror the PHY for the parameters of HCI_OP_LE_SET_CIG_PARAMS and will not update the PHY of the socket leaving it disabled. Fixes: 26afbd826ee32 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index f54864e19866..9777e7b109ee 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1551,8 +1551,8 @@ static void cis_add(struct iso_list_data *d, struct bt_iso_qos *qos) cis->cis_id = qos->cis; cis->c_sdu = cpu_to_le16(qos->out.sdu); cis->p_sdu = cpu_to_le16(qos->in.sdu); - cis->c_phy = qos->out.phy; - cis->p_phy = qos->in.phy; + cis->c_phy = qos->out.phy ? qos->out.phy : qos->in.phy; + cis->p_phy = qos->in.phy ? qos->in.phy : qos->out.phy; cis->c_rtn = qos->out.rtn; cis->p_rtn = qos->in.rtn; @@ -1735,13 +1735,6 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, if (!qos->in.latency) qos->in.latency = qos->out.latency; - /* Mirror PHYs that are disabled as SDU will be set to 0 */ - if (!qos->in.phy) - qos->in.phy = qos->out.phy; - - if (!qos->out.phy) - qos->out.phy = qos->in.phy; - if (!hci_le_set_cig_params(cis, qos)) { hci_conn_drop(cis); return ERR_PTR(-EINVAL); -- cgit From 0eee4995f40573f65ed67cea4d20fcf389d353de Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 28 Jul 2022 16:50:48 -0700 Subject: Bluetooth: ISO: Fix info leak in iso_sock_getsockopt() The C standard rules for when struct holes are zeroed out are slightly weird. The existing assignments might initialize everything, but GCC is allowed to (and does sometimes) leave the struct holes uninitialized, so instead of using yet another variable and copy the QoS settings just use a pointer to the stored QoS settings. Fixes: ccf74f2390d60 ("Bluetooth: Add BTPROTO_ISO socket type") Reported-by: Dan Carpenter Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 19d003727b50..dded22cde0d1 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1235,7 +1235,7 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; int len, err = 0; - struct bt_iso_qos qos; + struct bt_iso_qos *qos; u8 base_len; u8 *base; @@ -1261,12 +1261,12 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname, case BT_ISO_QOS: if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2) - qos = iso_pi(sk)->conn->hcon->iso_qos; + qos = &iso_pi(sk)->conn->hcon->iso_qos; else - qos = iso_pi(sk)->qos; + qos = &iso_pi(sk)->qos; - len = min_t(unsigned int, len, sizeof(qos)); - if (copy_to_user(optval, (char *)&qos, len)) + len = min_t(unsigned int, len, sizeof(*qos)); + if (copy_to_user(optval, qos, len)) err = -EFAULT; break; -- cgit From ce78e557ff8819f2d10e8d6bae79404bfbbd6809 Mon Sep 17 00:00:00 2001 From: Soenke Huster Date: Fri, 22 Jul 2022 13:53:07 +0200 Subject: Bluetooth: Fix null pointer deref on unexpected status event __hci_cmd_sync returns NULL if the controller responds with a status event. This is unexpected for the commands sent here, but on occurrence leads to null pointer dereferences and thus must be handled. Signed-off-by: Soenke Huster Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/aosp.c | 15 ++++++++++++--- net/bluetooth/msft.c | 15 ++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/aosp.c b/net/bluetooth/aosp.c index 432ae3aac9e3..1d67836e95e1 100644 --- a/net/bluetooth/aosp.c +++ b/net/bluetooth/aosp.c @@ -54,7 +54,10 @@ void aosp_do_open(struct hci_dev *hdev) /* LE Get Vendor Capabilities Command */ skb = __hci_cmd_sync(hdev, hci_opcode_pack(0x3f, 0x153), 0, NULL, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) { + if (IS_ERR_OR_NULL(skb)) { + if (!skb) + skb = ERR_PTR(-EIO); + bt_dev_err(hdev, "AOSP get vendor capabilities (%ld)", PTR_ERR(skb)); return; @@ -152,7 +155,10 @@ static int enable_quality_report(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, BQR_OPCODE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) { + if (IS_ERR_OR_NULL(skb)) { + if (!skb) + skb = ERR_PTR(-EIO); + bt_dev_err(hdev, "Enabling Android BQR failed (%ld)", PTR_ERR(skb)); return PTR_ERR(skb); @@ -171,7 +177,10 @@ static int disable_quality_report(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, BQR_OPCODE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) { + if (IS_ERR_OR_NULL(skb)) { + if (!skb) + skb = ERR_PTR(-EIO); + bt_dev_err(hdev, "Disabling Android BQR failed (%ld)", PTR_ERR(skb)); return PTR_ERR(skb); diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 14975769f678..bee6a4c656be 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -120,7 +120,10 @@ static bool read_supported_features(struct hci_dev *hdev, skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) { + if (IS_ERR_OR_NULL(skb)) { + if (!skb) + skb = ERR_PTR(-EIO); + bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", PTR_ERR(skb)); return false; @@ -319,8 +322,11 @@ static int msft_remove_monitor_sync(struct hci_dev *hdev, skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) + if (IS_ERR_OR_NULL(skb)) { + if (!skb) + return -EIO; return PTR_ERR(skb); + } return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); @@ -432,8 +438,11 @@ static int msft_add_monitor_sync(struct hci_dev *hdev, HCI_CMD_TIMEOUT); kfree(cp); - if (IS_ERR(skb)) + if (IS_ERR_OR_NULL(skb)) { + if (!skb) + return -EIO; return PTR_ERR(skb); + } return msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); -- cgit From b4443423278263d229dbeee12d09e657b78d64ab Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 29 Jul 2022 11:03:27 -0700 Subject: Bluetooth: ISO: Fix memory corruption The following memory corruption can happen since iso_pinfo.base size did not account for its headers (4 bytes): net/bluetooth/eir.c 76 memcpy(&eir[eir_len], data, data_len); ^^^^^^^ ^^^^^^^^ 77 eir_len += data_len; 78 79 return eir_len; 80 } The "eir" buffer has 252 bytes and data_len is 252 but we do a memcpy() to &eir[4] so this can corrupt 4 bytes beyond the end of the buffer. Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support") Signed-off-by: Luiz Augusto von Dentz Reported-by: Dan Carpenter --- net/bluetooth/iso.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index dded22cde0d1..70c2dd30cb13 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -44,6 +44,9 @@ static void iso_sock_kill(struct sock *sk); /* ----- ISO socket info ----- */ #define iso_pi(sk) ((struct iso_pinfo *)sk) +#define EIR_SERVICE_DATA_LENGTH 4 +#define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH) + struct iso_pinfo { struct bt_sock bt; bdaddr_t src; @@ -57,7 +60,7 @@ struct iso_pinfo { __u32 flags; struct bt_iso_qos qos; __u8 base_len; - __u8 base[HCI_MAX_PER_AD_LENGTH]; + __u8 base[BASE_MAX_LENGTH]; struct iso_conn *conn; }; -- cgit From 889f0346d47a0285093a3b665d1455c084636d9f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 3 Aug 2022 14:44:13 -0700 Subject: Bluetooth: hci_event: Fix build warning with C=1 This fixes the following warning when build with make C=1: net/bluetooth/hci_event.c:337:15: warning: restricted __le16 degrades to integer Fixes: a93661203641e ("Bluetooth: Process result of HCI Delete Stored Link Key command") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ea33dd0cd478..485c814cf44a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -328,14 +328,17 @@ static u8 hci_cc_delete_stored_link_key(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_rp_delete_stored_link_key *rp = data; + u16 num_keys; bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); if (rp->status) return rp->status; - if (rp->num_keys <= hdev->stored_num_keys) - hdev->stored_num_keys -= le16_to_cpu(rp->num_keys); + num_keys = le16_to_cpu(rp->num_keys); + + if (num_keys <= hdev->stored_num_keys) + hdev->stored_num_keys -= num_keys; else hdev->stored_num_keys = 0; -- cgit From 0c7937587d8b0337466c993dc9c7645767f57bfd Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 3 Aug 2022 14:51:16 -0700 Subject: Bluetooth: MGMT: Fixes build warnings with C=1 This fixes the following warning when building with make C=1: net/bluetooth/mgmt.c:3821:29: warning: restricted __le16 degrades to integer net/bluetooth/mgmt.c:4625:9: warning: cast to restricted __le32 Fixes: 600a87490ff98 ("Bluetooth: Implementation of MGMT_OP_SET_BLOCKED_KEYS.") Fixes: 4c54bf2b093bb ("Bluetooth: Add get/set device flags mgmt op") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 646d10401b80..f0bb2fc883d7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3819,7 +3819,7 @@ static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, hci_blocked_keys_clear(hdev); - for (i = 0; i < keys->key_count; ++i) { + for (i = 0; i < key_count; ++i) { struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL); if (!b) { @@ -4624,8 +4624,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u32 current_flags = __le32_to_cpu(cp->current_flags); bt_dev_dbg(hdev, "Set device flags %pMR (type 0x%x) = 0x%x", - &cp->addr.bdaddr, cp->addr.type, - __le32_to_cpu(current_flags)); + &cp->addr.bdaddr, cp->addr.type, current_flags); // We should take hci_dev_lock() early, I think.. conn_flags can change supported_flags = hdev->conn_flags; -- cgit From 9dfe1727b21927c6dd8d703e3a9618b505eb6224 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 3 Aug 2022 10:17:17 -0700 Subject: Bluetooth: ISO: Fix iso_sock_getsockopt for BT_DEFER_SETUP BT_DEFER_SETUP shall be considered valid for all states except for BT_CONNECTED as it is also used when initiated a connection rather then only for BT_BOUND and BT_LISTEN. Fixes: ccf74f2390d60 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 70c2dd30cb13..015d1b41bc32 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1251,7 +1251,7 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case BT_DEFER_SETUP: - if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; } -- cgit From 3f2893d3c142986aa935821460cb3adb77044722 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 5 Aug 2022 16:12:18 +0900 Subject: Bluetooth: don't try to cancel uninitialized works at mgmt_index_removed() syzbot is reporting attempt to cancel uninitialized work at mgmt_index_removed() [1], for calling cancel_delayed_work_sync() without INIT_DELAYED_WORK() is not permitted. INIT_DELAYED_WORK() is called from mgmt_init_hdev() via chan->hdev_init() from hci_mgmt_cmd(), but cancel_delayed_work_sync() is unconditionally called from mgmt_index_removed(). Call cancel_delayed_work_sync() only if HCI_MGMT flag was set, for mgmt_init_hdev() sets HCI_MGMT flag when calling INIT_DELAYED_WORK(). Link: https://syzkaller.appspot.com/bug?extid=b8ddd338a8838e581b1c [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Fixes: 0ef08313cefdd60d ("Bluetooth: Convert delayed discov_off to hci_sync") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f0bb2fc883d7..6e31023b84f5 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -8935,6 +8935,8 @@ void mgmt_index_removed(struct hci_dev *hdev) HCI_MGMT_EXT_INDEX_EVENTS); /* Cancel any remaining timed work */ + if (!hci_dev_test_flag(hdev, HCI_MGMT)) + return; cancel_delayed_work_sync(&hdev->discov_off); cancel_delayed_work_sync(&hdev->service_cache); cancel_delayed_work_sync(&hdev->rpa_expired); -- cgit From 1d1ab5d39be7590bb2400418877bff43da9e75ec Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Aug 2022 14:02:21 -0700 Subject: Bluetooth: ISO: Fix not using the correct QoS This fixes using wrong QoS settings when attempting to send frames while acting as peripheral since the QoS settings in use are stored in hconn->iso_qos not in sk->qos, this is actually properly handled on getsockopt(BT_ISO_QOS) but not on iso_send_frame. Fixes: ccf74f2390d60 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 015d1b41bc32..ced8ad4fed4f 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -373,15 +373,24 @@ done: return err; } +static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk) +{ + if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2) + return &iso_pi(sk)->conn->hcon->iso_qos; + + return &iso_pi(sk)->qos; +} + static int iso_send_frame(struct sock *sk, struct sk_buff *skb) { struct iso_conn *conn = iso_pi(sk)->conn; + struct bt_iso_qos *qos = iso_sock_get_qos(sk); struct hci_iso_data_hdr *hdr; int len = 0; BT_DBG("sk %p len %d", sk, skb->len); - if (skb->len > iso_pi(sk)->qos.out.sdu) + if (skb->len > qos->out.sdu) return -EMSGSIZE; len = skb->len; @@ -1263,10 +1272,7 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname, break; case BT_ISO_QOS: - if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2) - qos = &iso_pi(sk)->conn->hcon->iso_qos; - else - qos = &iso_pi(sk)->qos; + qos = iso_sock_get_qos(sk); len = min_t(unsigned int, len, sizeof(*qos)); if (copy_to_user(optval, qos, len)) -- cgit From 944e594cfa84ec552831489c244e02589d826b11 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Fri, 5 Aug 2022 08:18:10 +0200 Subject: net/x25: fix call timeouts in blocking connects When a userspace application starts a blocking connect(), a CALL REQUEST is sent, the t21 timer is started and the connect is waiting in x25_wait_for_connection_establishment(). If then for some reason the t21 timer expires before any reaction on the assigned logical channel (e.g. CALL ACCEPT, CLEAR REQUEST), there is sent a CLEAR REQUEST and timer t23 is started waiting for a CLEAR confirmation. If we now receive a CLEAR CONFIRMATION from the peer, x25_disconnect() is called in x25_state2_machine() with reason "0", which means "normal" call clearing. This is ok, but the parameter "reason" is used as sk->sk_err in x25_disconnect() and sock_error(sk) is evaluated in x25_wait_for_connection_establishment() to check if the call is still pending. As "0" is not rated as an error, the connect will stuck here forever. To fix this situation, also check if the sk->sk_state changed form TCP_SYN_SENT to TCP_CLOSE in the meantime, which is also done by x25_disconnect(). Signed-off-by: Martin Schiller Link: https://lore.kernel.org/r/20220805061810.10824-1-ms@dev.tdt.de Signed-off-by: Jakub Kicinski --- net/x25/af_x25.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 6bc2ac8d8146..3b55502b2965 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -719,6 +719,11 @@ static int x25_wait_for_connection_establishment(struct sock *sk) sk->sk_socket->state = SS_UNCONNECTED; break; } + rc = -ENOTCONN; + if (sk->sk_state == TCP_CLOSE) { + sk->sk_socket->state = SS_UNCONNECTED; + break; + } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); -- cgit From 8ef49f7f8244424adcf4a546dba4cbbeb0b09c09 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Fri, 29 Jul 2022 17:36:55 +0300 Subject: can: j1939: j1939_sk_queue_activate_next_locked(): replace WARN_ON_ONCE with netdev_warn_once() We should warn user-space that it is doing something wrong when trying to activate sessions with identical parameters but WARN_ON_ONCE macro can not be used here as it serves a different purpose. So it would be good to replace it with netdev_warn_once() message. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Fedor Pchelkin Signed-off-by: Alexey Khoroshilov Acked-by: Oleksij Rempel Link: https://lore.kernel.org/all/20220729143655.1108297-1-pchelkin@ispras.ru [mkl: fix indention] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index f5ecfdcf57b2..b670ba03a675 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -178,7 +178,10 @@ activate_next: if (!first) return; - if (WARN_ON_ONCE(j1939_session_activate(first))) { + if (j1939_session_activate(first)) { + netdev_warn_once(first->priv->ndev, + "%s: 0x%p: Identical session is already activated.\n", + __func__, first); first->err = -EBUSY; goto activate_next; } else { -- cgit From 8c21c54a53ab21842f5050fa090f26b03c0313d6 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Fri, 5 Aug 2022 18:02:16 +0300 Subject: can: j1939: j1939_session_destroy(): fix memory leak of skbs We need to drop skb references taken in j1939_session_skb_queue() when destroying a session in j1939_session_destroy(). Otherwise those skbs would be lost. Link to Syzkaller info and repro: https://forge.ispras.ru/issues/11743. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. V1: https://lore.kernel.org/all/20220708175949.539064-1-pchelkin@ispras.ru Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Suggested-by: Oleksij Rempel Signed-off-by: Fedor Pchelkin Signed-off-by: Alexey Khoroshilov Acked-by: Oleksij Rempel Link: https://lore.kernel.org/all/20220805150216.66313-1-pchelkin@ispras.ru Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 307ee1174a6e..d7d86c944d76 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -260,6 +260,8 @@ static void __j1939_session_drop(struct j1939_session *session) static void j1939_session_destroy(struct j1939_session *session) { + struct sk_buff *skb; + if (session->transmission) { if (session->err) j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT); @@ -274,7 +276,11 @@ static void j1939_session_destroy(struct j1939_session *session) WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); WARN_ON_ONCE(!list_empty(&session->active_session_list_entry)); - skb_queue_purge(&session->skb_queue); + while ((skb = skb_dequeue(&session->skb_queue)) != NULL) { + /* drop ref taken in j1939_session_skb_queue() */ + skb_unref(skb); + kfree_skb(skb); + } __j1939_session_drop(session); j1939_priv_put(session->priv); kfree(session); -- cgit From 34aae2c2fb1e3d88a5aeee16715cb6bf0336cdce Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2022 11:25:43 +0200 Subject: netfilter: nf_tables: validate variable length element extension Update template to validate variable length extensions. This patch adds a new .ext_len[id] field to the template to store the expected extension length. This is used to sanity check the initialization of the variable length extension. Use PTR_ERR() in nft_set_elem_init() to report errors since, after this update, there are two reason why this might fail, either because of ENOMEM or insufficient room in the extension field (EINVAL). Kernels up until 7e6bc1f6cabc ("netfilter: nf_tables: stricter validation of element data") allowed to copy more data to the extension than was allocated. This ext_len field allows to validate if the destination has the correct size as additional check. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 84 +++++++++++++++++++++++++++++++++++-------- net/netfilter/nft_dynset.c | 2 +- 2 files changed, 70 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9f976b11d896..3b09e13b9b5c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5467,6 +5467,27 @@ err_set_elem_expr: return ERR_PTR(err); } +static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len) +{ + len += nft_set_ext_types[id].len; + if (len > tmpl->ext_len[id] || + len > U8_MAX) + return -1; + + return 0; +} + +static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, + void *to, const void *from, u32 len) +{ + if (nft_set_ext_check(tmpl, id, len) < 0) + return -1; + + memcpy(to, from, len); + + return 0; +} + void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, @@ -5477,17 +5498,26 @@ void *nft_set_elem_init(const struct nft_set *set, elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); if (elem == NULL) - return NULL; + return ERR_PTR(-ENOMEM); ext = nft_set_elem_ext(set, elem); nft_set_ext_init(ext, tmpl); - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY)) - memcpy(nft_set_ext_key(ext), key, set->klen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) - memcpy(nft_set_ext_key_end(ext), key_end, set->klen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) - memcpy(nft_set_ext_data(ext), data, set->dlen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY, + nft_set_ext_key(ext), key, set->klen) < 0) + goto err_ext_check; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END, + nft_set_ext_key_end(ext), key_end, set->klen) < 0) + goto err_ext_check; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA, + nft_set_ext_data(ext), data, set->dlen) < 0) + goto err_ext_check; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; if (expiration == 0) @@ -5497,6 +5527,11 @@ void *nft_set_elem_init(const struct nft_set *set, *nft_set_ext_timeout(ext) = timeout; return elem; + +err_ext_check: + kfree(elem); + + return ERR_PTR(-EINVAL); } static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, @@ -5584,14 +5619,25 @@ err_expr: } static int nft_set_elem_expr_setup(struct nft_ctx *ctx, + const struct nft_set_ext_tmpl *tmpl, const struct nft_set_ext *ext, struct nft_expr *expr_array[], u32 num_exprs) { struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + u32 len = sizeof(struct nft_set_elem_expr); struct nft_expr *expr; int i, err; + if (num_exprs == 0) + return 0; + + for (i = 0; i < num_exprs; i++) + len += expr_array[i]->ops->size; + + if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0) + return -EINVAL; + for (i = 0; i < num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); err = nft_expr_clone(expr, expr_array[i]); @@ -6054,17 +6100,23 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } } - err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL_ACCOUNT); - if (elem.priv == NULL) + if (IS_ERR(elem.priv)) { + err = PTR_ERR(elem.priv); goto err_parse_data; + } ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; + if (ulen > 0) { + if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { + err = -EINVAL; + goto err_elem_userdata; + } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); @@ -6073,14 +6125,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } - err = nft_set_elem_expr_setup(ctx, ext, expr_array, num_exprs); + err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) - goto err_elem_expr; + goto err_elem_free; trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err_elem_expr; + goto err_elem_free; } ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; @@ -6126,10 +6178,10 @@ err_set_full: nft_setelem_remove(ctx->net, set, &elem); err_element_clash: kfree(trans); -err_elem_expr: +err_elem_free: if (obj) obj->use--; - +err_elem_userdata: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) @@ -6311,8 +6363,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, NULL, 0, 0, GFP_KERNEL_ACCOUNT); - if (elem.priv == NULL) + if (IS_ERR(elem.priv)) { + err = PTR_ERR(elem.priv); goto fail_elem_key_end; + } ext = nft_set_elem_ext(set, elem.priv); if (flags) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 22f70b543fa2..6983e6ddeef9 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -60,7 +60,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, ®s->data[priv->sreg_key], NULL, ®s->data[priv->sreg_data], timeout, 0, GFP_ATOMIC); - if (elem == NULL) + if (IS_ERR(elem)) goto err1; ext = nft_set_elem_ext(set, elem); -- cgit From 470ee20e069a6d05ae549f7d0ef2bdbcee6a81b2 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 9 Aug 2022 14:01:46 -0300 Subject: netfilter: nf_tables: do not allow SET_ID to refer to another table When doing lookups for sets on the same batch by using its ID, a set from a different table can be used. Then, when the table is removed, a reference to the set may be kept after the set is freed, leading to a potential use-after-free. When looking for sets by ID, use the table that was used for the lookup by name, and only return sets belonging to that same table. This fixes CVE-2022-2586, also reported as ZDI-CAN-17470. Reported-by: Team Orca of Sea Security (@seasecresponse) Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3b09e13b9b5c..41c529b0001c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3842,6 +3842,7 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, } static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -3853,6 +3854,7 @@ static struct nft_set *nft_set_lookup_byid(const struct net *net, struct nft_set *set = nft_trans_set(trans); if (id == nft_trans_set_id(trans) && + set->table == table && nft_active_genmask(set, genmask)) return set; } @@ -3873,7 +3875,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net, if (!nla_set_id) return set; - set = nft_set_lookup_byid(net, nla_set_id, genmask); + set = nft_set_lookup_byid(net, table, nla_set_id, genmask); } return set; } -- cgit From 95f466d22364a33d183509629d0879885b4f547e Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 9 Aug 2022 14:01:47 -0300 Subject: netfilter: nf_tables: do not allow CHAIN_ID to refer to another table When doing lookups for chains on the same batch by using its ID, a chain from a different table can be used. If a rule is added to a table but refers to a chain in a different table, it will be linked to the chain in table2, but would have expressions referring to objects in table1. Then, when table1 is removed, the rule will not be removed as its linked to a chain in table2. When expressions in the rule are processed or removed, that will lead to a use-after-free. When looking for chains by ID, use the table that was used for the lookup by name, and only return chains belonging to that same table. Fixes: 837830a4b439 ("netfilter: nf_tables: add NFTA_RULE_CHAIN_ID attribute") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 41c529b0001c..041accbaca32 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2472,6 +2472,7 @@ err: } static struct nft_chain *nft_chain_lookup_byid(const struct net *net, + const struct nft_table *table, const struct nlattr *nla) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -2482,6 +2483,7 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net, struct nft_chain *chain = trans->ctx.chain; if (trans->msg_type == NFT_MSG_NEWCHAIN && + chain->table == table && id == nft_trans_chain_id(trans)) return chain; } @@ -3417,7 +3419,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return -EOPNOTSUPP; } else if (nla[NFTA_RULE_CHAIN_ID]) { - chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); + chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); return PTR_ERR(chain); @@ -9661,7 +9663,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, tb[NFTA_VERDICT_CHAIN], genmask); } else if (tb[NFTA_VERDICT_CHAIN_ID]) { - chain = nft_chain_lookup_byid(ctx->net, + chain = nft_chain_lookup_byid(ctx->net, ctx->table, tb[NFTA_VERDICT_CHAIN_ID]); if (IS_ERR(chain)) return PTR_ERR(chain); -- cgit From 36d5b2913219ac853908b0f1c664345e04313856 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 9 Aug 2022 14:01:48 -0300 Subject: netfilter: nf_tables: do not allow RULE_ID to refer to another chain When doing lookups for rules on the same batch by using its ID, a rule from a different chain can be used. If a rule is added to a chain but tries to be positioned next to a rule from a different chain, it will be linked to chain2, but the use counter on chain1 would be the one to be incremented. When looking for rules by ID, use the chain that was used for the lookup by name. The chain used in the context copied to the transaction needs to match that same chain. That way, struct nft_rule does not need to get enlarged with another member. Fixes: 1a94e38d254b ("netfilter: nf_tables: add NFTA_RULE_ID attribute") Fixes: 75dd48e2e420 ("netfilter: nf_tables: Support RULE_ID reference in new rule") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 041accbaca32..0c3c0523e5f2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3373,6 +3373,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla); #define NFT_RULE_MAXEXPRS 128 @@ -3461,7 +3462,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return PTR_ERR(old_rule); } } else if (nla[NFTA_RULE_POSITION_ID]) { - old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]); + old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]); return PTR_ERR(old_rule); @@ -3606,6 +3607,7 @@ err_release_expr: } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -3616,6 +3618,7 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, struct nft_rule *rule = nft_trans_rule(trans); if (trans->msg_type == NFT_MSG_NEWRULE && + trans->ctx.chain == chain && id == nft_trans_rule_id(trans)) return rule; } @@ -3665,7 +3668,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, err = nft_delrule(&ctx, rule); } else if (nla[NFTA_RULE_ID]) { - rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); + rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]); return PTR_ERR(rule); -- cgit From 341b6941608762d8235f3fd1e45e4d7114ed8c2c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 8 Aug 2022 19:30:06 +0200 Subject: netfilter: nf_tables: upfront validation of data via nft_data_init() Instead of parsing the data and then validate that type and length are correct, pass a description of the expected data so it can be validated upfront before parsing it to bail out earlier. This patch adds a new .size field to specify the maximum size of the data area. The .len field is optional and it is used as an input/output field, it provides the specific length of the expected data in the input path. If then .len field is not specified, then obtained length from the netlink attribute is stored. This is required by cmp, bitwise, range and immediate, which provide no netlink attribute that describes the data length. The immediate expression uses the destination register type to infer the expected data type. Relying on opencoded validation of the expected data might lead to subtle bugs as described in 7e6bc1f6cabc ("netfilter: nf_tables: stricter validation of element data"). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 78 ++++++++++++++++++++++--------------------- net/netfilter/nft_bitwise.c | 66 ++++++++++++++++++------------------ net/netfilter/nft_cmp.c | 44 +++++++++++------------- net/netfilter/nft_immediate.c | 22 ++++++++++-- net/netfilter/nft_range.c | 27 +++++++-------- 5 files changed, 124 insertions(+), 113 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0c3c0523e5f2..05896765c68f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5202,19 +5202,13 @@ static int nft_setelem_parse_flags(const struct nft_set *set, static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, struct nft_data *key, struct nlattr *attr) { - struct nft_data_desc desc; - int err; - - err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); - if (err < 0) - return err; - - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { - nft_data_release(key, desc.type); - return -EINVAL; - } + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = NFT_DATA_VALUE_MAXLEN, + .len = set->klen, + }; - return 0; + return nft_data_init(ctx, key, &desc, attr); } static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, @@ -5223,24 +5217,17 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nlattr *attr) { u32 dtype; - int err; - - err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); - if (err < 0) - return err; if (set->dtype == NFT_DATA_VERDICT) dtype = NFT_DATA_VERDICT; else dtype = NFT_DATA_VALUE; - if (dtype != desc->type || - set->dlen != desc->len) { - nft_data_release(data, desc->type); - return -EINVAL; - } + desc->type = dtype; + desc->size = NFT_DATA_VALUE_MAXLEN; + desc->len = set->dlen; - return 0; + return nft_data_init(ctx, data, desc, attr); } static void *nft_setelem_catchall_get(const struct net *net, @@ -9685,7 +9672,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, } desc->len = sizeof(data->verdict); - desc->type = NFT_DATA_VERDICT; + return 0; } @@ -9738,20 +9725,25 @@ nla_put_failure: } static int nft_value_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, - struct nft_data_desc *desc, const struct nlattr *nla) + struct nft_data *data, struct nft_data_desc *desc, + const struct nlattr *nla) { unsigned int len; len = nla_len(nla); if (len == 0) return -EINVAL; - if (len > size) + if (len > desc->size) return -EOVERFLOW; + if (desc->len) { + if (len != desc->len) + return -EINVAL; + } else { + desc->len = len; + } nla_memcpy(data->data, nla, len); - desc->type = NFT_DATA_VALUE; - desc->len = len; + return 0; } @@ -9771,7 +9763,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * * @ctx: context of the expression using the data * @data: destination struct nft_data - * @size: maximum data length * @desc: data description * @nla: netlink attribute containing data * @@ -9781,24 +9772,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * The caller can indicate that it only wants to accept data of type * NFT_DATA_VALUE by passing NULL for the ctx argument. */ -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { struct nlattr *tb[NFTA_DATA_MAX + 1]; int err; + if (WARN_ON_ONCE(!desc->size)) + return -EINVAL; + err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL); if (err < 0) return err; - if (tb[NFTA_DATA_VALUE]) - return nft_value_init(ctx, data, size, desc, - tb[NFTA_DATA_VALUE]); - if (tb[NFTA_DATA_VERDICT] && ctx != NULL) - return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); - return -EINVAL; + if (tb[NFTA_DATA_VALUE]) { + if (desc->type != NFT_DATA_VALUE) + return -EINVAL; + + err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); + } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) { + if (desc->type != NFT_DATA_VERDICT) + return -EINVAL; + + err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); + } else { + err = -EINVAL; + } + + return err; } EXPORT_SYMBOL_GPL(nft_data_init); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 83590afe3768..e6e402b247d0 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -93,7 +93,16 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { static int nft_bitwise_init_bool(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc mask, xor; + struct nft_data_desc mask = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->mask), + .len = priv->len, + }; + struct nft_data_desc xor = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->xor), + .len = priv->len, + }; int err; if (tb[NFTA_BITWISE_DATA]) @@ -103,37 +112,30 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, !tb[NFTA_BITWISE_XOR]) return -EINVAL; - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask, - tb[NFTA_BITWISE_MASK]); + err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) { - err = -EINVAL; - goto err_mask_release; - } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor, - tb[NFTA_BITWISE_XOR]); + err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]); if (err < 0) - goto err_mask_release; - if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) { - err = -EINVAL; - goto err_xor_release; - } + goto err_xor_err; return 0; -err_xor_release: - nft_data_release(&priv->xor, xor.type); -err_mask_release: +err_xor_err: nft_data_release(&priv->mask, mask.type); + return err; } static int nft_bitwise_init_shift(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc d; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = sizeof(u32), + }; int err; if (tb[NFTA_BITWISE_MASK] || @@ -143,13 +145,12 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, if (!tb[NFTA_BITWISE_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, - tb[NFTA_BITWISE_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]); if (err < 0) return err; - if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || - priv->data.data[0] >= BITS_PER_TYPE(u32)) { - nft_data_release(&priv->data, d.type); + + if (priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, desc.type); return -EINVAL; } @@ -339,22 +340,21 @@ static const struct nft_expr_ops nft_bitwise_ops = { static int nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out) { - struct nft_data_desc desc; struct nft_data data; - int err = 0; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + .len = sizeof(u32), + }; + int err; - err = nft_data_init(NULL, &data, sizeof(data), &desc, tb); + err = nft_data_init(NULL, &data, &desc, tb); if (err < 0) return err; - if (desc.type != NFT_DATA_VALUE || desc.len != sizeof(u32)) { - err = -EINVAL; - goto err; - } *out = data.data[0]; -err: - nft_data_release(&data, desc.type); - return err; + + return 0; } static int nft_bitwise_fast_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 777f09e4dc60..963cf831799c 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -73,20 +73,16 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + }; int err; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; - if (desc.type != NFT_DATA_VALUE) { - err = -EINVAL; - nft_data_release(&priv->data, desc.type); - return err; - } - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -214,12 +210,14 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; int err; - err = nft_data_init(NULL, &data, sizeof(data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; @@ -313,11 +311,13 @@ static int nft_cmp16_fast_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + }; int err; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; @@ -380,8 +380,11 @@ const struct nft_expr_ops nft_cmp16_fast_ops = { static const struct nft_expr_ops * nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { - struct nft_data_desc desc; struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; enum nft_cmp_ops op; u8 sreg; int err; @@ -404,14 +407,10 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return ERR_PTR(-EINVAL); } - err = nft_data_init(NULL, &data, sizeof(data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return ERR_PTR(err); - if (desc.type != NFT_DATA_VALUE) - goto err1; - sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) { @@ -423,9 +422,6 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return &nft_cmp16_fast_ops; } return &nft_cmp_ops; -err1: - nft_data_release(&data, desc.type); - return ERR_PTR(-EINVAL); } struct nft_expr_type nft_cmp_type __read_mostly = { diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index b80f7b507349..5f28b21abc7d 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -29,20 +29,36 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, }; +static enum nft_data_types nft_reg_to_type(const struct nlattr *nla) +{ + enum nft_data_types type; + u8 reg; + + reg = ntohl(nla_get_be32(nla)); + if (reg == NFT_REG_VERDICT) + type = NFT_DATA_VERDICT; + else + type = NFT_DATA_VALUE; + + return type; +} + static int nft_immediate_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_immediate_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .size = sizeof(priv->data), + }; int err; if (tb[NFTA_IMMEDIATE_DREG] == NULL || tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; - err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_IMMEDIATE_DATA]); + desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]); + err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 66f77484c227..832f0d725a9e 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -51,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr const struct nlattr * const tb[]) { struct nft_range_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc_from, desc_to; + struct nft_data_desc desc_from = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_from), + }; + struct nft_data_desc desc_to = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_to), + }; int err; u32 op; @@ -61,26 +68,16 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr !tb[NFTA_RANGE_TO_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), - &desc_from, tb[NFTA_RANGE_FROM_DATA]); + err = nft_data_init(NULL, &priv->data_from, &desc_from, + tb[NFTA_RANGE_FROM_DATA]); if (err < 0) return err; - if (desc_from.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err1; - } - - err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), - &desc_to, tb[NFTA_RANGE_TO_DATA]); + err = nft_data_init(NULL, &priv->data_to, &desc_to, + tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; - if (desc_to.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err2; - } - if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; -- cgit From f323ef3a0d49e147365284bc1f02212e617b7f09 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 8 Aug 2022 19:30:07 +0200 Subject: netfilter: nf_tables: disallow jump to implicit chain from set element Extend struct nft_data_desc to add a flag field that specifies nft_data_init() is being called for set element data. Use it to disallow jump to implicit chain from set element, only jump to chain via immediate expression is allowed. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 05896765c68f..460b0925ea60 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5226,6 +5226,7 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, desc->type = dtype; desc->size = NFT_DATA_VALUE_MAXLEN; desc->len = set->dlen; + desc->flags = NFT_DATA_DESC_SETELEM; return nft_data_init(ctx, data, desc, attr); } @@ -9665,6 +9666,9 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, return PTR_ERR(chain); if (nft_is_base_chain(chain)) return -EOPNOTSUPP; + if (desc->flags & NFT_DATA_DESC_SETELEM && + chain->flags & NFT_CHAIN_BINDING) + return -EINVAL; chain->use++; data->verdict.chain = chain; -- cgit From 580077855a40741cf511766129702d97ff02f4d9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2022 18:34:02 +0200 Subject: netfilter: nf_tables: fix null deref due to zeroed list head In nf_tables_updtable, if nf_tables_table_enable returns an error, nft_trans_destroy is called to free the transaction object. nft_trans_destroy() calls list_del(), but the transaction was never placed on a list -- the list head is all zeroes, this results in a null dereference: BUG: KASAN: null-ptr-deref in nft_trans_destroy+0x26/0x59 Call Trace: nft_trans_destroy+0x26/0x59 nf_tables_newtable+0x4bc/0x9bc [..] Its sane to assume that nft_trans_destroy() can be called on the transaction object returned by nft_trans_alloc(), so make sure the list head is initialised. Fixes: 55dd6f93076b ("netfilter: nf_tables: use new transaction infrastructure to handle table") Reported-by: mingi cho Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 460b0925ea60..3cc88998b879 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -153,6 +153,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, if (trans == NULL) return NULL; + INIT_LIST_HEAD(&trans->list); trans->msg_type = msg_type; trans->ctx = *ctx; -- cgit From 1f0752628e76bb3a02109dbd980381f27060ba92 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Aug 2022 23:30:31 +0200 Subject: bpf: Allow calling bpf_prog_test kfuncs in tracing programs In addition to TC hook, enable these in tracing programs so that they can be used in selftests. Acked-by: Yonghong Song Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220809213033.24147-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index cbc9cd5058cb..d11209367dd0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1628,6 +1628,7 @@ static int __init bpf_prog_test_run_init(void) int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, ARRAY_SIZE(bpf_prog_test_dtor_kfunc), THIS_MODULE); -- cgit From ab7e2e0dfa5d37540ab1dc5376e9a2cb9188925d Mon Sep 17 00:00:00 2001 From: Matthias May Date: Fri, 5 Aug 2022 21:19:06 +0200 Subject: ipv6: do not use RT_TOS for IPv6 flowlabel According to Guillaume Nault RT_TOS should never be used for IPv6. Quote: RT_TOS() is an old macro used to interprete IPv4 TOS as described in the obsolete RFC 1349. It's conceptually wrong to use it even in IPv4 code, although, given the current state of the code, most of the existing calls have no consequence. But using RT_TOS() in IPv6 code is always a bug: IPv6 never had a "TOS" field to be interpreted the RFC 1349 way. There's no historical compatibility to worry about. Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.") Acked-by: Guillaume Nault Signed-off-by: Matthias May Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 897ca4f9b791..f152e51242cb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1311,8 +1311,7 @@ struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, fl6.daddr = info->key.u.ipv6.dst; fl6.saddr = info->key.u.ipv6.src; prio = info->key.tos; - fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio), - info->key.label); + fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label); dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, NULL); -- cgit From 2cd0e8dba7a529a7706d1e81209f98a6351f8ad0 Mon Sep 17 00:00:00 2001 From: Topi Miettinen Date: Sat, 6 Aug 2022 13:12:53 +0300 Subject: netlabel: fix typo in comment 'IPv4 and IPv4' should be 'IPv4 and IPv6'. Signed-off-by: Topi Miettinen Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/netlabel/netlabel_unlabeled.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 8490e46359ae..0555dffd80e0 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -885,7 +885,7 @@ static int netlbl_unlabel_staticadd(struct sk_buff *skb, /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each - * for IPv4 and IPv4, with the same LSM security context which should + * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !info->attrs[NLBL_UNLABEL_A_IFACE] || -- cgit From 7e97cfed9929eaabc41829c395eb0d1350fccb9d Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 8 Aug 2022 11:04:47 -0700 Subject: vsock: Fix memory leak in vsock_connect() An O_NONBLOCK vsock_connect() request may try to reschedule @connect_work. Imagine the following sequence of vsock_connect() requests: 1. The 1st, non-blocking request schedules @connect_work, which will expire after 200 jiffies. Socket state is now SS_CONNECTING; 2. Later, the 2nd, blocking request gets interrupted by a signal after a few jiffies while waiting for the connection to be established. Socket state is back to SS_UNCONNECTED, but @connect_work is still pending, and will expire after 100 jiffies. 3. Now, the 3rd, non-blocking request tries to schedule @connect_work again. Since @connect_work is already scheduled, schedule_delayed_work() silently returns. sock_hold() is called twice, but sock_put() will only be called once in vsock_connect_timeout(), causing a memory leak reported by syzbot: BUG: memory leak unreferenced object 0xffff88810ea56a40 (size 1232): comm "syz-executor756", pid 3604, jiffies 4294947681 (age 12.350s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 28 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 (..@............ backtrace: [] sk_prot_alloc+0x3e/0x1b0 net/core/sock.c:1930 [] sk_alloc+0x32/0x2e0 net/core/sock.c:1989 [] __vsock_create.constprop.0+0x38/0x320 net/vmw_vsock/af_vsock.c:734 [] vsock_create+0xc1/0x2d0 net/vmw_vsock/af_vsock.c:2203 [] __sock_create+0x1ab/0x2b0 net/socket.c:1468 [] sock_create net/socket.c:1519 [inline] [] __sys_socket+0x6f/0x140 net/socket.c:1561 [] __do_sys_socket net/socket.c:1570 [inline] [] __se_sys_socket net/socket.c:1568 [inline] [] __x64_sys_socket+0x1a/0x20 net/socket.c:1568 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0x80 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x44/0xae <...> Use mod_delayed_work() instead: if @connect_work is already scheduled, reschedule it, and undo sock_hold() to keep the reference count balanced. Reported-and-tested-by: syzbot+b03f55bf128f9a38f064@syzkaller.appspotmail.com Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Co-developed-by: Stefano Garzarella Signed-off-by: Stefano Garzarella Reviewed-by: Stefano Garzarella Signed-off-by: Peilin Ye Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index f04abf662ec6..4d68681f5abe 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1391,7 +1391,14 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * timeout fires. */ sock_hold(sk); - schedule_delayed_work(&vsk->connect_work, timeout); + + /* If the timeout function is already scheduled, + * reschedule it, then ungrab the socket refcount to + * keep it balanced. + */ + if (mod_delayed_work(system_wq, &vsk->connect_work, + timeout)) + sock_put(sk); /* Skip ahead to preserve error code set above. */ goto out_wait; -- cgit From a3e7b29e30854ed67be0d17687e744ad0c769c4b Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 8 Aug 2022 11:05:25 -0700 Subject: vsock: Set socket state back to SS_UNCONNECTED in vsock_connect_timeout() Imagine two non-blocking vsock_connect() requests on the same socket. The first request schedules @connect_work, and after it times out, vsock_connect_timeout() sets *sock* state back to TCP_CLOSE, but keeps *socket* state as SS_CONNECTING. Later, the second request returns -EALREADY, meaning the socket "already has a pending connection in progress", even though the first request has already timed out. As suggested by Stefano, fix it by setting *socket* state back to SS_UNCONNECTED, so that the second request will return -ETIMEDOUT. Suggested-by: Stefano Garzarella Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reviewed-by: Stefano Garzarella Signed-off-by: Peilin Ye Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4d68681f5abe..b4ee163154a6 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1286,6 +1286,7 @@ static void vsock_connect_timeout(struct work_struct *work) if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; + sk->sk_socket->state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk_error_report(sk); vsock_transport_cancel_pkt(vsk); -- cgit From 6b4db2e528f650c7fb712961aac36455468d5902 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Aug 2022 14:35:06 +0300 Subject: devlink: Fix use-after-free after a failed reload After a failed devlink reload, devlink parameters are still registered, which means user space can set and get their values. In the case of the mlxsw "acl_region_rehash_interval" parameter, these operations will trigger a use-after-free [1]. Fix this by rejecting set and get operations while in the failed state. Return the "-EOPNOTSUPP" error code which does not abort the parameters dump, but instead causes it to skip over the problematic parameter. Another possible fix is to perform these checks in the mlxsw parameter callbacks, but other drivers might be affected by the same problem and I am not aware of scenarios where these stricter checks will cause a regression. [1] mlxsw_spectrum3 0000:00:10.0: Port 125: Failed to register netdev mlxsw_spectrum3 0000:00:10.0: Failed to create ports ================================================================== BUG: KASAN: use-after-free in mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 Read of size 4 at addr ffff8880099dcfd8 by task kworker/u4:4/777 CPU: 1 PID: 777 Comm: kworker/u4:4 Not tainted 5.19.0-rc7-custom-126601-gfe26f28c586d #1 Hardware name: QEMU MSN4700, BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x92/0xbd lib/dump_stack.c:106 print_address_description mm/kasan/report.c:313 [inline] print_report.cold+0x5e/0x5cf mm/kasan/report.c:429 kasan_report+0xb9/0xf0 mm/kasan/report.c:491 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report_generic.c:306 mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 mlxsw_sp_acl_region_rehash_intrvl_get+0x49/0x60 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c:1106 mlxsw_sp_params_acl_region_rehash_intrvl_get+0x33/0x80 drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3854 devlink_param_get net/core/devlink.c:4981 [inline] devlink_nl_param_fill+0x238/0x12d0 net/core/devlink.c:5089 devlink_param_notify+0xe5/0x230 net/core/devlink.c:5168 devlink_ns_change_notify net/core/devlink.c:4417 [inline] devlink_ns_change_notify net/core/devlink.c:4396 [inline] devlink_reload+0x15f/0x700 net/core/devlink.c:4507 devlink_pernet_pre_exit+0x112/0x1d0 net/core/devlink.c:12272 ops_pre_exit_list net/core/net_namespace.c:152 [inline] cleanup_net+0x494/0xc00 net/core/net_namespace.c:582 process_one_work+0x9fc/0x1710 kernel/workqueue.c:2289 worker_thread+0x675/0x10b0 kernel/workqueue.c:2436 kthread+0x30c/0x3d0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The buggy address belongs to the physical page: page:ffffea0000267700 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x99dc flags: 0x100000000000000(node=0|zone=1) raw: 0100000000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880099dce80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dcf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff8880099dcf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8880099dd000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dd080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Fixes: 98bbf70c1c41 ("mlxsw: spectrum: add "acl_region_rehash_interval" devlink param") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 5da5c7cca98a..b50bcc18b8d9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5147,7 +5147,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get) + if (!param->get || devlink->reload_failed) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -5156,7 +5156,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set) + if (!param->set || devlink->reload_failed) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } -- cgit From 3c5f6e698b5c538bbb23cd453b22e1e4922cffd8 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 10 Aug 2022 16:05:32 +0800 Subject: bpf: Acquire map uref in .init_seq_private for sock local storage map iterator bpf_iter_attach_map() acquires a map uref, and the uref may be released before or in the middle of iterating map elements. For example, the uref could be released in bpf_iter_detach_map() as part of bpf_link_release(), or could be released in bpf_map_put_with_uref() as part of bpf_map_release(). So acquiring an extra map uref in bpf_iter_init_sk_storage_map() and releasing it in bpf_iter_fini_sk_storage_map(). Fixes: 5ce6e77c7edf ("bpf: Implement bpf iterator for sock local storage map") Signed-off-by: Hou Tao Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220810080538.1845898-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- net/core/bpf_sk_storage.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a25ec93729b9..83b89ba824d7 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -875,10 +875,18 @@ static int bpf_iter_init_sk_storage_map(void *priv_data, { struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + bpf_map_inc_with_uref(aux->map); seq_info->map = aux->map; return 0; } +static void bpf_iter_fini_sk_storage_map(void *priv_data) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + bpf_map_put_with_uref(seq_info->map); +} + static int bpf_iter_attach_map(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) @@ -924,7 +932,7 @@ static const struct seq_operations bpf_sk_storage_map_seq_ops = { static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_sk_storage_map_seq_ops, .init_seq_private = bpf_iter_init_sk_storage_map, - .fini_seq_private = NULL, + .fini_seq_private = bpf_iter_fini_sk_storage_map, .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), }; -- cgit From f0d2b2716d71778d0b0c8eaa433c073287d69d93 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 10 Aug 2022 16:05:33 +0800 Subject: bpf: Acquire map uref in .init_seq_private for sock{map,hash} iterator sock_map_iter_attach_target() acquires a map uref, and the uref may be released before or in the middle of iterating map elements. For example, the uref could be released in sock_map_iter_detach_target() as part of bpf_link_release(), or could be released in bpf_map_put_with_uref() as part of bpf_map_release(). Fixing it by acquiring an extra map uref in .init_seq_private and releasing it in .fini_seq_private. Fixes: 0365351524d7 ("net: Allow iterating sockmap and sockhash") Signed-off-by: Hou Tao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220810080538.1845898-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- net/core/sock_map.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 028813dfecb0..9a9fb9487d63 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -783,13 +783,22 @@ static int sock_map_init_seq_private(void *priv_data, { struct sock_map_seq_info *info = priv_data; + bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } +static void sock_map_fini_seq_private(void *priv_data) +{ + struct sock_map_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, + .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; @@ -1369,18 +1378,27 @@ static const struct seq_operations sock_hash_seq_ops = { }; static int sock_hash_init_seq_private(void *priv_data, - struct bpf_iter_aux_info *aux) + struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; + bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } +static void sock_hash_fini_seq_private(void *priv_data) +{ + struct sock_hash_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, + .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; -- cgit From 52bd05eb7c88e1ad8541a48873188ccebca9da26 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 10 Aug 2022 16:05:34 +0800 Subject: bpf: Check the validity of max_rdwr_access for sock local storage map iterator The value of sock local storage map is writable in map iterator, so check max_rdwr_access instead of max_rdonly_access. Fixes: 5ce6e77c7edf ("bpf: Implement bpf iterator for sock local storage map") Signed-off-by: Hou Tao Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220810080538.1845898-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- net/core/bpf_sk_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 83b89ba824d7..1b7f385643b4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -904,7 +904,7 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto put_map; - if (prog->aux->max_rdonly_access > map->value_size) { + if (prog->aux->max_rdwr_access > map->value_size) { err = -EACCES; goto put_map; } -- cgit From 2a0133723f9ebeb751cfce19f74ec07e108bef1f Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 5 Aug 2022 15:48:34 +0800 Subject: net: fix refcount bug in sk_psock_get (2) Syzkaller reports refcount bug as follows: ------------[ cut here ]------------ refcount_t: saturated; leaking memory. WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19 Modules linked in: CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0 __refcount_add_not_zero include/linux/refcount.h:163 [inline] __refcount_inc_not_zero include/linux/refcount.h:227 [inline] refcount_inc_not_zero include/linux/refcount.h:245 [inline] sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439 tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091 tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983 tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057 tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659 tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682 sk_backlog_rcv include/net/sock.h:1061 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2849 release_sock+0x54/0x1b0 net/core/sock.c:3404 inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909 __sys_shutdown_sock net/socket.c:2331 [inline] __sys_shutdown_sock net/socket.c:2325 [inline] __sys_shutdown+0xf1/0x1b0 net/socket.c:2343 __do_sys_shutdown net/socket.c:2351 [inline] __se_sys_shutdown net/socket.c:2349 [inline] __x64_sys_shutdown+0x50/0x70 net/socket.c:2349 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 During SMC fallback process in connect syscall, kernel will replaces TCP with SMC. In order to forward wakeup smc socket waitqueue after fallback, kernel will sets clcsk->sk_user_data to origin smc socket in smc_fback_replace_callbacks(). Later, in shutdown syscall, kernel will calls sk_psock_get(), which treats the clcsk->sk_user_data as psock type, triggering the refcnt warning. So, the root cause is that smc and psock, both will use sk_user_data field. So they will mismatch this field easily. This patch solves it by using another bit(defined as SK_USER_DATA_PSOCK) in PTRMASK, to mark whether sk_user_data points to a psock object or not. This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). For there will possibly be more flags in the sk_user_data field, this patch also refactor sk_user_data flags code to be more generic to improve its maintainability. Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com Suggested-by: Jakub Kicinski Acked-by: Wen Gu Signed-off-by: Hawkins Jiawei Reviewed-by: Jakub Sitnicki Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 81627892bdd4..57e942a6431a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -739,7 +739,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); - rcu_assign_sk_user_data_nocopy(sk, psock); + __rcu_assign_sk_user_data_with_flags(sk, psock, + SK_USER_DATA_NOCOPY | + SK_USER_DATA_PSOCK); sock_hold(sk); out: -- cgit From 9ad36309e2719a884f946678e0296be10f0bb4c1 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 9 Aug 2022 14:05:18 -0300 Subject: net_sched: cls_route: remove from list when handle is 0 When a route filter is replaced and the old filter has a 0 handle, the old one won't be removed from the hashtable, while it will still be freed. The test was there since before commit 1109c00547fc ("net: sched: RCU cls_route"), when a new filter was not allocated when there was an old one. The old filter was reused and the reinserting would only be necessary if an old filter was replaced. That was still wrong for the same case where the old handle was 0. Remove the old filter from the list independently from its handle value. This fixes CVE-2022-2588, also reported as ZDI-CAN-17440. Reported-by: Zhenpeng Lin Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Kamal Mostafa Cc: Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20220809170518.164662-1-cascardo@canonical.com Signed-off-by: Jakub Kicinski --- net/sched/cls_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index a35ab8c27866..3f935cbbaff6 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -526,7 +526,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); - if (fold && fold->handle && f->handle != fold->handle) { + if (fold) { th = to_hash(fold->handle); h = from_hash(fold->handle >> 16); b = rtnl_dereference(head->table[th]); -- cgit From 86b259f6f8880237899fbf4f940303b3987dffa9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Aug 2022 10:55:43 -0700 Subject: tls: rx: device: bound the frag walk We can't do skb_walk_frags() on the input skbs, because the input skbs is really just a pointer to the tcp read queue. We need to bound the "is decrypted" check by the amount of data in the message. Note that the walk in tls_device_reencrypt() is after a CoW so the skb there is safe to walk. Actually in the current implementation it can't have frags at all, but whatever, maybe one day it will. Reported-by: Tariq Toukan Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Tested-by: Ran Rozenstein Link: https://lore.kernel.org/r/20220809175544.354343-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index e3e6cf75aa03..6ed41474bdf8 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -984,11 +984,17 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) int is_decrypted = skb->decrypted; int is_encrypted = !is_decrypted; struct sk_buff *skb_iter; + int left; + left = rxm->full_len - skb->len; /* Check if all the data is decrypted already */ - skb_walk_frags(skb, skb_iter) { + skb_iter = skb_shinfo(skb)->frag_list; + while (skb_iter && left > 0) { is_decrypted &= skb_iter->decrypted; is_encrypted &= !skb_iter->decrypted; + + left -= skb_iter->len; + skb_iter = skb_iter->next; } trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, -- cgit From d800a7b3577bfb783481b02865d8775a760212a7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Aug 2022 10:55:44 -0700 Subject: tls: rx: device: don't try to copy too much on detach Another device offload bug, we use the length of the output skb as an indication of how much data to copy. But that skb is sized to offset + record length, and we start from offset. So we end up double-counting the offset which leads to skb_copy_bits() returning -EFAULT. Reported-by: Tariq Toukan Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Tested-by: Ran Rozenstein Link: https://lore.kernel.org/r/20220809175544.354343-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_strp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index f0b7c9122fba..9b79e334dbd9 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -41,7 +41,7 @@ static struct sk_buff *tls_strp_msg_make_copy(struct tls_strparser *strp) struct sk_buff *skb; int i, err, offset; - skb = alloc_skb_with_frags(0, strp->anchor->len, TLS_PAGE_ORDER, + skb = alloc_skb_with_frags(0, strp->stm.full_len, TLS_PAGE_ORDER, &err, strp->sk->sk_allocation); if (!skb) return NULL; -- cgit From 94ce3b64c62d4b628cf85cd0d9a370aca8f7e43a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 10 Aug 2022 11:16:02 +0300 Subject: net/tls: Use RCU API to access tls_ctx->netdev Currently, tls_device_down synchronizes with tls_device_resync_rx using RCU, however, the pointer to netdev is stored using WRITE_ONCE and loaded using READ_ONCE. Although such approach is technically correct (rcu_dereference is essentially a READ_ONCE, and rcu_assign_pointer uses WRITE_ONCE to store NULL), using special RCU helpers for pointers is more valid, as it includes additional checks and might change the implementation transparently to the callers. Mark the netdev pointer as __rcu and use the correct RCU helpers to access it. For non-concurrent access pass the right conditions that guarantee safe access (locks taken, refcount value). Also use the correct helper in mlx5e, where even READ_ONCE was missing. The transition to RCU exposes existing issues, fixed by this commit: 1. bond_tls_device_xmit could read netdev twice, and it could become NULL the second time, after the NULL check passed. 2. Drivers shouldn't stop processing the last packet if tls_device_down just set netdev to NULL, before tls_dev_del was called. This prevents a possible packet drop when transitioning to the fallback software mode. Fixes: 89df6a810470 ("net/bonding: Implement TLS TX device offload") Fixes: c55dcdd435aa ("net/tls: Fix use-after-free after the TLS device goes down and up") Signed-off-by: Maxim Mikityanskiy Link: https://lore.kernel.org/r/20220810081602.1435800-1-maximmi@nvidia.com Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 38 +++++++++++++++++++++++++++++--------- net/tls/tls_device_fallback.c | 3 ++- 2 files changed, 31 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 6ed41474bdf8..0f983e5f7dde 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -71,7 +71,13 @@ static void tls_device_tx_del_task(struct work_struct *work) struct tls_offload_context_tx *offload_ctx = container_of(work, struct tls_offload_context_tx, destruct_work); struct tls_context *ctx = offload_ctx->ctx; - struct net_device *netdev = ctx->netdev; + struct net_device *netdev; + + /* Safe, because this is the destroy flow, refcount is 0, so + * tls_device_down can't store this field in parallel. + */ + netdev = rcu_dereference_protected(ctx->netdev, + !refcount_read(&ctx->refcount)); netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); @@ -81,6 +87,7 @@ static void tls_device_tx_del_task(struct work_struct *work) static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { + struct net_device *netdev; unsigned long flags; bool async_cleanup; @@ -91,7 +98,14 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) } list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */ - async_cleanup = ctx->netdev && ctx->tx_conf == TLS_HW; + + /* Safe, because this is the destroy flow, refcount is 0, so + * tls_device_down can't store this field in parallel. + */ + netdev = rcu_dereference_protected(ctx->netdev, + !refcount_read(&ctx->refcount)); + + async_cleanup = netdev && ctx->tx_conf == TLS_HW; if (async_cleanup) { struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx); @@ -229,7 +243,8 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); - netdev = tls_ctx->netdev; + netdev = rcu_dereference_protected(tls_ctx->netdev, + lockdep_is_held(&device_offload_lock)); if (netdev) err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, @@ -710,7 +725,7 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); rcu_read_lock(); - netdev = READ_ONCE(tls_ctx->netdev); + netdev = rcu_dereference(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); @@ -1035,7 +1050,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk, if (sk->sk_destruct != tls_device_sk_destruct) { refcount_set(&ctx->refcount, 1); dev_hold(netdev); - ctx->netdev = netdev; + RCU_INIT_POINTER(ctx->netdev, netdev); spin_lock_irq(&tls_device_lock); list_add_tail(&ctx->list, &tls_device_list); spin_unlock_irq(&tls_device_lock); @@ -1306,7 +1321,8 @@ void tls_device_offload_cleanup_rx(struct sock *sk) struct net_device *netdev; down_read(&device_offload_lock); - netdev = tls_ctx->netdev; + netdev = rcu_dereference_protected(tls_ctx->netdev, + lockdep_is_held(&device_offload_lock)); if (!netdev) goto out; @@ -1315,7 +1331,7 @@ void tls_device_offload_cleanup_rx(struct sock *sk) if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); - tls_ctx->netdev = NULL; + rcu_assign_pointer(tls_ctx->netdev, NULL); } else { set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } @@ -1335,7 +1351,11 @@ static int tls_device_down(struct net_device *netdev) spin_lock_irqsave(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { - if (ctx->netdev != netdev || + struct net_device *ctx_netdev = + rcu_dereference_protected(ctx->netdev, + lockdep_is_held(&device_offload_lock)); + + if (ctx_netdev != netdev || !refcount_inc_not_zero(&ctx->refcount)) continue; @@ -1352,7 +1372,7 @@ static int tls_device_down(struct net_device *netdev) /* Stop the RX and TX resync. * tls_dev_resync must not be called after tls_dev_del. */ - WRITE_ONCE(ctx->netdev, NULL); + rcu_assign_pointer(ctx->netdev, NULL); /* Start skipping the RX resync logic completely. */ set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags); diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 618cee704217..7dfc8023e0f1 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -426,7 +426,8 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb) { - if (dev == tls_get_ctx(sk)->netdev || netif_is_bond_master(dev)) + if (dev == rcu_dereference_bh(tls_get_ctx(sk)->netdev) || + netif_is_bond_master(dev)) return skb; return tls_sw_fallback(sk, skb); -- cgit