From 2c82c7e724ff51cab78e1afd5c2aaa31994fe41e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Apr 2019 14:53:11 +0200 Subject: netfilter: nf_tables: fix oops during rule dump We can oops in nf_tables_fill_rule_info(). It's not possible to fetch the previous element in rcu-protected lists when deletions are not prevented somehow: list_del_rcu poisons the ->prev pointer value. Before the rcu conversion this was safe, as dump operations held the nfnetlink mutex. Pass the previous rule as an argument, obtained by keeping a pointer to the previous rule during traversal. Fixes: d9adf22a291883 ("netfilter: nf_tables: use call_rcu in netlink dumps") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 28241e82fd15..4b5159936034 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2270,13 +2270,13 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule) + const struct nft_rule *rule, + const struct nft_rule *prule) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; const struct nft_expr *expr, *next; struct nlattr *list; - const struct nft_rule *prule; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); @@ -2296,8 +2296,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { - prule = list_prev_entry(rule, list); + if (event != NFT_MSG_DELRULE && prule) { if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(prule->handle), NFTA_RULE_PAD)) @@ -2344,7 +2343,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, ctx->family, ctx->table, - ctx->chain, rule); + ctx->chain, rule, NULL); if (err < 0) { kfree_skb(skb); goto err; } @@ -2369,12 +2368,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, const struct nft_chain *chain) { struct net *net = sock_net(skb->sk); + const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; - const struct nft_rule *rule; + prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) - goto cont; + goto cont_skip; if (*idx < s_idx) goto cont; if (*idx > s_idx) { @@ -2386,11 +2386,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule) < 0) + table, chain, rule, prule) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: + prule = rule; +cont_skip: (*idx)++; } return 0; @@ -2546,7 +2548,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule); + family, table, chain, rule, NULL); if (err < 0) goto err; -- cgit From 946c0d8e6ed43dae6527e878d0077c1e11015db0 Mon Sep 17 00:00:00 2001 From: Jagdish Motwani Date: Mon, 13 May 2019 23:47:40 +0530 Subject: netfilter: nf_queue: fix reinject verdict handling This patch fixes netfilter hook traversal when more than one hook returns an NF_QUEUE verdict.
When the first queue reinjects the packet, 'nf_reinject' starts traversing hooks with a proper hook_index. However, if it again receives a NF_QUEUE verdict (by some other netfilter hook), it queues the packet with a wrong hook_index. So, when the second queue reinjects the packet, it re-executes hooks in between. Fixes: 960632ece694 ("netfilter: convert hook list to an array") Signed-off-by: Jagdish Motwani Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 9dc1d6e04946..b5b2be55ca82 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -255,6 +255,7 @@ static unsigned int nf_iterate(struct sk_buff *skb, repeat: verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { + *index = i; if (verdict != NF_REPEAT) return verdict; goto repeat; -- cgit From e633508a95289489d28faacb68b32c3e7e68ef6f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 15 May 2019 20:15:32 +0200 Subject: netfilter: nft_fib: Fix existence check support NFTA_FIB_F_PRESENT flag was not always honored since eval functions did not call nft_fib_store_result in all cases. Given that in all callsites there is a struct net_device pointer available which holds the interface data to be stored in destination register, simplify nft_fib_store_result() to just accept that pointer instead of the nft_pktinfo pointer and interface index. This also allows to drop the index to interface lookup previously needed to get the name associated with given index. Fixes: 055c4b34b94f6 ("netfilter: nft_fib: Support existence check") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_fib_ipv4.c | 23 +++-------------------- net/ipv6/netfilter/nft_fib_ipv6.c | 16 ++-------------- net/netfilter/nft_fib.c | 6 +++--- 3 files changed, 8 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 94eb25bc8d7e..c8888e52591f 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -58,11 +58,6 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); -static int get_ifindex(const struct net_device *dev) -{ - return dev ? 
dev->ifindex : 0; -} - void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -94,8 +89,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, pkt, - nft_in(pkt)->ifindex); + nft_fib_store_result(dest, priv, nft_in(pkt)); return; } @@ -108,8 +102,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) { - nft_fib_store_result(dest, priv, pkt, - get_ifindex(pkt->skb->dev)); + nft_fib_store_result(dest, priv, pkt->skb->dev); return; } } @@ -150,17 +143,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, found = oif; } - switch (priv->result) { - case NFT_FIB_RESULT_OIF: - *dest = found->ifindex; - break; - case NFT_FIB_RESULT_OIFNAME: - strncpy((char *)dest, found->name, IFNAMSIZ); - break; - default: - WARN_ON_ONCE(1); - break; - } + nft_fib_store_result(dest, priv, found); } EXPORT_SYMBOL_GPL(nft_fib4_eval); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 73cdc0bc63f7..ec068b0cffca 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -169,8 +169,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, pkt, - nft_in(pkt)->ifindex); + nft_fib_store_result(dest, priv, nft_in(pkt)); return; } @@ -187,18 +186,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (oif && oif != rt->rt6i_idev->dev) goto put_rt_err; - switch (priv->result) { - case NFT_FIB_RESULT_OIF: - *dest = rt->rt6i_idev->dev->ifindex; - break; - case NFT_FIB_RESULT_OIFNAME: - strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ); - break; - default: - WARN_ON_ONCE(1); - break; - } - + nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); put_rt_err: ip6_rt_put(rt); } diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 21df8cccea65..77f00a99dfab 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -135,17 +135,17 @@ int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) EXPORT_SYMBOL_GPL(nft_fib_dump); void nft_fib_store_result(void *reg, const struct nft_fib *priv, - const struct nft_pktinfo *pkt, int index) + const struct net_device *dev) { - struct net_device *dev; u32 *dreg = reg; + int index; switch (priv->result) { case NFT_FIB_RESULT_OIF: + index = dev ? dev->ifindex : 0; *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? 
!!index : index; break; case NFT_FIB_RESULT_OIFNAME: - dev = dev_get_by_index_rcu(nft_net(pkt), index); if (priv->flags & NFTA_FIB_F_PRESENT) *dreg = !!dev; else -- cgit From 719c7d563c17b150877cee03a4b812a424989dfa Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 17 May 2019 22:31:49 +0800 Subject: ipvs: Fix use-after-free in ip_vs_in BUG: KASAN: use-after-free in ip_vs_in.part.29+0xe8/0xd20 [ip_vs] Read of size 4 at addr ffff8881e9b26e2c by task sshd/5603 CPU: 0 PID: 5603 Comm: sshd Not tainted 4.19.39+ #30 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Call Trace: dump_stack+0x71/0xab print_address_description+0x6a/0x270 kasan_report+0x179/0x2c0 ip_vs_in.part.29+0xe8/0xd20 [ip_vs] ip_vs_in+0xd8/0x170 [ip_vs] nf_hook_slow+0x5f/0xe0 __ip_local_out+0x1d5/0x250 ip_local_out+0x19/0x60 __tcp_transmit_skb+0xba1/0x14f0 tcp_write_xmit+0x41f/0x1ed0 ? _copy_from_iter_full+0xca/0x340 __tcp_push_pending_frames+0x52/0x140 tcp_sendmsg_locked+0x787/0x1600 ? tcp_sendpage+0x60/0x60 ? inet_sk_set_state+0xb0/0xb0 tcp_sendmsg+0x27/0x40 sock_sendmsg+0x6d/0x80 sock_write_iter+0x121/0x1c0 ? sock_sendmsg+0x80/0x80 __vfs_write+0x23e/0x370 vfs_write+0xe7/0x230 ksys_write+0xa1/0x120 ? __ia32_sys_read+0x50/0x50 ? __audit_syscall_exit+0x3ce/0x450 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7ff6f6147c60 Code: 73 01 c3 48 8b 0d 28 12 2d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 5d 73 2d 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 RSP: 002b:00007ffd772ead18 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000034 RCX: 00007ff6f6147c60 RDX: 0000000000000034 RSI: 000055df30a31270 RDI: 0000000000000003 RBP: 000055df30a31270 R08: 0000000000000000 R09: 0000000000000000 R10: 00007ffd772ead70 R11: 0000000000000246 R12: 00007ffd772ead74 R13: 00007ffd772eae20 R14: 00007ffd772eae24 R15: 000055df2f12ddc0 Allocated by task 6052: kasan_kmalloc+0xa0/0xd0 __kmalloc+0x10a/0x220 ops_init+0x97/0x190 register_pernet_operations+0x1ac/0x360 register_pernet_subsys+0x24/0x40 0xffffffffc0ea016d do_one_initcall+0x8b/0x253 do_init_module+0xe3/0x335 load_module+0x2fc0/0x3890 __do_sys_finit_module+0x192/0x1c0 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 6067: __kasan_slab_free+0x130/0x180 kfree+0x90/0x1a0 ops_free_list.part.7+0xa6/0xc0 unregister_pernet_operations+0x18b/0x1f0 unregister_pernet_subsys+0x1d/0x30 ip_vs_cleanup+0x1d/0xd2f [ip_vs] __x64_sys_delete_module+0x20c/0x300 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff8881e9b26600 which belongs to the cache kmalloc-4096 of size 4096 The buggy address is located 2092 bytes inside of 4096-byte region [ffff8881e9b26600, ffff8881e9b27600) The buggy address belongs to the page: page:ffffea0007a6c800 count:1 mapcount:0 mapping:ffff888107c0e600 index:0x0 compound_mapcount: 0 flags: 0x17ffffc0008100(slab|head) raw: 0017ffffc0008100 dead000000000100 dead000000000200 ffff888107c0e600 raw: 0000000000000000 0000000080070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected while unregistering ipvs module, ops_free_list calls __ip_vs_cleanup, then nf_unregister_net_hooks be called to do remove nf hook entries. It need a RCU period to finish, however net->ipvs is set to NULL immediately, which will trigger NULL pointer dereference when a packet is hooked and handled by ip_vs_in where net->ipvs is dereferenced. 
Another scenario is that ops_free_list calls ops_free to free the net_generic area directly once __ip_vs_cleanup has finished; calling ip_vs_in then triggers a use-after-free. This patch moves nf_unregister_net_hooks from __ip_vs_cleanup() to __ip_vs_dev_cleanup(), where rcu_barrier() is called by unregister_pernet_device -> unregister_pernet_operations, which provides the needed grace period. Reported-by: Hulk Robot Fixes: efe41606184e ("ipvs: convert to use pernet nf_hook api") Suggested-by: Julian Anastasov Signed-off-by: YueHaibing Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 14457551bcb4..8ebf21149ec3 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2312,7 +2312,6 @@ static void __net_exit __ip_vs_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ ip_vs_conn_net_cleanup(ipvs); ip_vs_app_net_cleanup(ipvs); @@ -2327,6 +2326,7 @@ static void __net_exit __ip_vs_dev_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); EnterFunction(2); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); -- cgit From 6bac76db1da3cb162c425d58ae421486f8e43955 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 May 2019 13:48:10 +0200 Subject: netfilter: nat: fix udp checksum corruption Due to a copy-and-paste error, nf_nat_mangle_udp_packet passes IPPROTO_TCP, resulting in an incorrect UDP checksum when the payload has to be mangled. Fixes: dac3fe72596f9 ("netfilter: nat: remove csum_recalc hook") Reported-by: Marc Haber Tested-by: Marc Haber Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index ccc06f7539d7..53aeb12b70fb 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -170,7 +170,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) return true; - nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_TCP, + nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_UDP, udph, &udph->check, datalen, oldlen); return true; -- cgit From e75b3e1c9bc5b997d09bdf8eb72ab3dd3c1a7072 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:30 +0200 Subject: netfilter: nf_flow_table: ignore DF bit setting It's irrelevant whether the DF bit is set or not: we must pass the packet to the stack in either case. If the DF bit is set, we must pass it to the stack so the appropriate ICMP error can be generated. If DF is not set, we must pass it to the stack for fragmentation.
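In code terms, the fast-path check this leaves behind is simply the following (a condensed sketch rather than the hunk itself; nf_flow_exceeds_mtu() and the tuple mtu field are the ones used in net/netfilter/nf_flow_table_ip.c):

	/* Oversized packet on the flowtable fast path: hand it back to the
	 * normal IP stack in both cases, so testing DF here buys nothing.
	 *   - DF set:   the stack generates the ICMP "fragmentation needed" error
	 *   - DF clear: the stack fragments the packet
	 */
	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
		return NF_ACCEPT;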
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 0d603e20b519..bfd44db9f214 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -243,8 +243,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; outdev = rt->dst.dev; - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && - (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; if (skb_try_make_writable(skb, sizeof(*iph))) -- cgit From 8437a6209f76f85a2db1abb12a9bde2170801617 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:31 +0200 Subject: netfilter: nft_flow_offload: set liberal tracking mode for tcp Without it, whenever a packet has to be pushed up the stack (e.g. because of mtu mismatch), then conntrack will flag packets as invalid, which in turn breaks NAT. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 69d7a8439c7a..bde63d9c3c4e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -72,6 +72,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; + bool is_tcp = false; struct nf_conn *ct; int ret; @@ -84,6 +85,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: + is_tcp = true; + break; case IPPROTO_UDP: break; default: @@ -108,6 +111,11 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; + if (is_tcp) { + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + } + ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; -- cgit From 91a9048f238063dde7feea752b9dd386f7e3808b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:32 +0200 Subject: netfilter: nft_flow_offload: don't offload when sequence numbers need adjustment We can't deal with tcp sequence number rewrite in flow_offload. While at it, simplify helper check, we only need to know if the extension is present, we don't need the helper data. 
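Put differently, the eval path only needs two boolean tests on the conntrack entry before refusing to offload; a condensed sketch of the check the hunk below arrives at (nf_ct_ext_exist() and the IPS_SEQ_ADJUST status bit are the standard conntrack definitions):

	/* Don't offload flows the fast path cannot forward correctly: a
	 * conntrack helper is attached, or TCP sequence numbers are being
	 * rewritten (the flowtable never runs the seqadj code).
	 */
	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & IPS_SEQ_ADJUST)
		goto out;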
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index bde63d9c3c4e..c97c03c3939a 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -12,7 +12,6 @@ #include #include #include -#include struct nft_flow_offload { struct nft_flowtable *flowtable; @@ -67,7 +66,6 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; - const struct nf_conn_help *help; enum ip_conntrack_info ctinfo; struct nf_flow_route route; struct flow_offload *flow; @@ -93,8 +91,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto out; } - help = nfct_help(ct); - if (help) + if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || + ct->status & IPS_SEQ_ADJUST) goto out; if (!nf_ct_is_confirmed(ct)) -- cgit From 69aeb538587e087bfc81dd1f465eab3558ff3158 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:33 +0200 Subject: netfilter: nft_flow_offload: IPCB is only valid for ipv4 family Guard this with a check vs. ipv4, IPCB isn't valid in ipv6 case. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index c97c03c3939a..d70742e95e14 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -48,15 +48,20 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, return 0; } -static bool nft_flow_offload_skip(struct sk_buff *skb) +static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { - struct ip_options *opt = &(IPCB(skb)->opt); - - if (unlikely(opt->optlen)) - return true; if (skb_sec_path(skb)) return true; + if (family == NFPROTO_IPV4) { + const struct ip_options *opt; + + opt = &(IPCB(skb)->opt); + + if (unlikely(opt->optlen)) + return true; + } + return false; } @@ -74,7 +79,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nf_conn *ct; int ret; - if (nft_flow_offload_skip(pkt->skb)) + if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); -- cgit From 7dc2bccab0ee37ac28096b8fcdc390a679a15841 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 21 May 2019 06:40:04 +0000 Subject: Validate required parameters in inet6_validate_link_af inet6_set_link_af requires that at least one of IFLA_INET6_TOKEN or IFLA_INET6_ADDR_GET_MODE is passed. If none of them is passed, it returns -EINVAL, which may cause do_setlink() to fail in the middle of processing other commands and give the following warning message: A link change request failed with some changes committed already. Interface eth0 may have been left with an inconsistent configuration, please check. Check the presence of at least one of them in inet6_validate_link_af to detect invalid parameters at an early stage, before do_setlink does anything. Also validate the address generation mode at an early stage. Signed-off-by: Maxim Mikityanskiy Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 57 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f96d1de79509..b51630ddb728 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5661,18 +5661,6 @@ static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = { [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) }, }; -static int inet6_validate_link_af(const struct net_device *dev, - const struct nlattr *nla) -{ - struct nlattr *tb[IFLA_INET6_MAX + 1]; - - if (dev && !__in6_dev_get(dev)) - return -EAFNOSUPPORT; - - return nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, - inet6_af_policy, NULL); -} - static int check_addr_gen_mode(int mode) { if (mode != IN6_ADDR_GEN_MODE_EUI64 && @@ -5693,14 +5681,44 @@ static int check_stable_privacy(struct inet6_dev *idev, struct net *net, return 1; } +static int inet6_validate_link_af(const struct net_device *dev, + const struct nlattr *nla) +{ + struct nlattr *tb[IFLA_INET6_MAX + 1]; + struct inet6_dev *idev = NULL; + int err; + + if (dev) { + idev = __in6_dev_get(dev); + if (!idev) + return -EAFNOSUPPORT; + } + + err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, + inet6_af_policy, NULL); + if (err) + return err; + + if (!tb[IFLA_INET6_TOKEN] && !tb[IFLA_INET6_ADDR_GEN_MODE]) + return -EINVAL; + + if (tb[IFLA_INET6_ADDR_GEN_MODE]) { + u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); + + if (check_addr_gen_mode(mode) < 0) + return -EINVAL; + if (dev && check_stable_privacy(idev, dev_net(dev), mode) < 0) + return -EINVAL; + } + + return 0; +} + static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) { - int err = -EINVAL; struct inet6_dev *idev = __in6_dev_get(dev); struct nlattr *tb[IFLA_INET6_MAX + 1]; - - if (!idev) - return -EAFNOSUPPORT; + int err; if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) BUG(); @@ -5714,15 +5732,10 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (tb[IFLA_INET6_ADDR_GEN_MODE]) { u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); - if (check_addr_gen_mode(mode) < 0 || - check_stable_privacy(idev, dev_net(dev), mode) < 0) - return -EINVAL; - idev->cnf.addr_gen_mode = mode; - err = 0; } - return err; + return 0; } static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, -- cgit From 38030d7cb77963ba84cdbe034806e2b81245339f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:00 -0700 Subject: net/tls: avoid NULL-deref on resync during device removal When netdev with active kTLS sockets in unregistered notifier callback walks the offloaded sockets and cleans up offload state. RX data may still be processed, however, and if resync was requested prior to device removal we would hit a NULL pointer dereference on ctx->netdev use. Make sure resync is under the device offload lock and NULL-check the netdev pointer. This should be safe, because the pointer is set to NULL either in the netdev notifier (under said lock) or when socket is completely dead and no resync can happen. The other access to ctx->netdev in tls_validate_xmit_skb() does not dereference the pointer, it just checks it against other device pointer, so it should be pretty safe (perhaps we can add a READ_ONCE/WRITE_ONCE there, if paranoid). Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. 
Miller --- net/tls/tls_device.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ca54a7c7ec81..aa33e4accc32 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -553,8 +553,8 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct net_device *netdev = tls_ctx->netdev; struct tls_offload_context_rx *rx_ctx; + struct net_device *netdev; u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -568,10 +568,15 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) is_req_pending = resync_req; if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, - seq + TLS_HEADER_SIZE - 1, - rcd_sn); + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { + seq += TLS_HEADER_SIZE - 1; + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (netdev) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, + rcd_sn); + up_read(&device_offload_lock); + } } static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) -- cgit From 3686637e507b48525fcea6fb91e1988bdbc14530 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:01 -0700 Subject: net/tls: fix state removal with feature flags off TLS offload drivers shouldn't (and currently don't) block the TLS offload feature changes based on whether there are active offloaded connections or not. This seems to be a good idea, because we want the admin to be able to disable the TLS offload at any time, and there is no clean way of disabling it for active connections (TX side is quite problematic). So if features are cleared existing connections will stay offloaded until they close, and new connections will not attempt offload to a given device. However, the offload state removal handling is currently broken if feature flags get cleared while there are active TLS offloads. RX side will completely bail from cleanup, even on normal remove path, leaving device state dangling, potentially causing issues when the 5-tuple is reused. It will also fail to release the netdev reference. Remove the RX-side warning message, in next release cycle it should be printed when features are disabled, rather than when connection dies, but for that we need a more efficient method of finding connection of a given netdev (a'la BPF offload code). Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. 
Miller --- net/tls/tls_device.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index aa33e4accc32..07650446e892 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -939,12 +939,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk) if (!netdev) goto out; - if (!(netdev->features & NETIF_F_HW_TLS_RX)) { - pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", - __func__); - goto out; - } - netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); -- cgit From c3f4a6c39cf269a40d45f813c05fa830318ad875 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:02 -0700 Subject: net/tls: don't ignore netdev notifications if no TLS features On device surprise removal path (the notifier) we can't bail just because the features are disabled. They may have been enabled during the lifetime of the device. This bug leads to leaking netdev references and use-after-frees if there are active connections while device features are cleared. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/tls/tls_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 07650446e892..b95c408fd771 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -997,7 +997,8 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) + if (!dev->tlsdev_ops && + !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { -- cgit From 31680ac265802397937d75461a2809a067b9fb93 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 15:12:18 -0700 Subject: ipv6: Fix redirect with VRF IPv6 redirect is broken for VRF. __ip6_route_redirect walks the FIB entries looking for an exact match on ifindex. With VRF the flowi6_oif is updated by l3mdev_update_flow to the l3mdev index and the FLOWI_FLAG_SKIP_NH_OIF set in the flags to tell the lookup to skip the device match. For redirects the device match is requires so use that flag to know when the oif needs to be reset to the skb device index. Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a014ca877ed..848e944f07df 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2512,6 +2512,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_info *rt; struct fib6_node *fn; + /* l3mdev_update_flow overrides oif if the device is enslaved; in + * this case we must match on the real ingress device, so reset it + */ + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) + fl6->flowi6_oif = skb->dev->ifindex; + /* Get the "current" route for this destination and * check if the redirect has come from appropriate router. 
* -- cgit From 3580d04aa674383c42de7b635d28e52a1e5bc72c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 May 2019 16:51:22 -0700 Subject: ipv4/igmp: fix another memory leak in igmpv3_del_delrec() syzbot reported memory leaks [1] that I have back tracked to a missing cleanup from igmpv3_del_delrec() when (im->sfmode != MCAST_INCLUDE) Add ip_sf_list_clear_all() and kfree_pmc() helpers to explicitely handle the cleanups before freeing. [1] BUG: memory leak unreferenced object 0xffff888123e32b00 (size 64): comm "softirq", pid 0, jiffies 4294942968 (age 8.010s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 e0 00 00 01 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000006105011b>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<000000006105011b>] slab_post_alloc_hook mm/slab.h:439 [inline] [<000000006105011b>] slab_alloc mm/slab.c:3326 [inline] [<000000006105011b>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<000000004bba8073>] kmalloc include/linux/slab.h:547 [inline] [<000000004bba8073>] kzalloc include/linux/slab.h:742 [inline] [<000000004bba8073>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline] [<000000004bba8073>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085 [<00000000a46a65a0>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475 [<000000005956ca89>] do_ip_setsockopt.isra.0+0x1795/0x1930 net/ipv4/ip_sockglue.c:957 [<00000000848e2d2f>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246 [<00000000b9db185c>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616 [<000000003028e438>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3130 [<0000000015b65589>] __sys_setsockopt+0x98/0x120 net/socket.c:2078 [<00000000ac198ef0>] __do_sys_setsockopt net/socket.c:2089 [inline] [<00000000ac198ef0>] __se_sys_setsockopt net/socket.c:2086 [inline] [<00000000ac198ef0>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086 [<000000000a770437>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<00000000d3adb93b>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 9c8bb163ae78 ("igmp, mld: Fix memory leak in igmpv3/mld_del_delrec()") Signed-off-by: Eric Dumazet Cc: Hangbin Liu Reported-by: syzbot Signed-off-by: David S. 
Miller --- net/ipv4/igmp.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6c2febc39dca..1a8d36dd49d4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -633,6 +633,24 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) } } +static void ip_sf_list_clear_all(struct ip_sf_list *psf) +{ + struct ip_sf_list *next; + + while (psf) { + next = psf->sf_next; + kfree(psf); + psf = next; + } +} + +static void kfree_pmc(struct ip_mc_list *pmc) +{ + ip_sf_list_clear_all(pmc->sources); + ip_sf_list_clear_all(pmc->tomb); + kfree(pmc); +} + static void igmpv3_send_cr(struct in_device *in_dev) { struct ip_mc_list *pmc, *pmc_prev, *pmc_next; @@ -669,7 +687,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) else in_dev->mc_tomb = pmc_next; in_dev_put(pmc->interface); - kfree(pmc); + kfree_pmc(pmc); } else pmc_prev = pmc; } @@ -1215,14 +1233,18 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; + pmc->tomb = NULL; + im->sources = pmc->sources; + pmc->sources = NULL; + for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } else { im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); - kfree(pmc); + kfree_pmc(pmc); } spin_unlock_bh(&im->lock); } @@ -1243,21 +1265,18 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) nextpmc = pmc->next; ip_mc_clear_src(pmc); in_dev_put(pmc->interface); - kfree(pmc); + kfree_pmc(pmc); } /* clear dead sources, too */ rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { - struct ip_sf_list *psf, *psf_next; + struct ip_sf_list *psf; spin_lock_bh(&pmc->lock); psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); - for (; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); - } + ip_sf_list_clear_all(psf); } rcu_read_unlock(); } @@ -2123,7 +2142,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, static void ip_mc_clear_src(struct ip_mc_list *pmc) { - struct ip_sf_list *psf, *nextpsf, *tomb, *sources; + struct ip_sf_list *tomb, *sources; spin_lock_bh(&pmc->lock); tomb = pmc->tomb; @@ -2135,14 +2154,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) pmc->sfcount[MCAST_EXCLUDE] = 1; spin_unlock_bh(&pmc->lock); - for (psf = tomb; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); - } - for (psf = sources; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); - } + ip_sf_list_clear_all(tomb); + ip_sf_list_clear_all(sources); } /* Join a multicast group -- cgit From 903869bd10e6719b9df6718e785be7ec725df59f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 May 2019 18:35:16 -0700 Subject: ipv4/igmp: fix build error if !CONFIG_IP_MULTICAST ip_sf_list_clear_all() needs to be defined even if !CONFIG_IP_MULTICAST Fixes: 3580d04aa674 ("ipv4/igmp: fix another memory leak in igmpv3_del_delrec()") Signed-off-by: Eric Dumazet Reported-by: kbuild test robot Signed-off-by: David S. 
Miller --- net/ipv4/igmp.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1a8d36dd49d4..eb03153dfe12 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -188,6 +188,17 @@ static void ip_ma_put(struct ip_mc_list *im) pmc != NULL; \ pmc = rtnl_dereference(pmc->next_rcu)) +static void ip_sf_list_clear_all(struct ip_sf_list *psf) +{ + struct ip_sf_list *next; + + while (psf) { + next = psf->sf_next; + kfree(psf); + psf = next; + } +} + #ifdef CONFIG_IP_MULTICAST /* @@ -633,17 +644,6 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) } } -static void ip_sf_list_clear_all(struct ip_sf_list *psf) -{ - struct ip_sf_list *next; - - while (psf) { - next = psf->sf_next; - kfree(psf); - psf = next; - } -} - static void kfree_pmc(struct ip_mc_list *pmc) { ip_sf_list_clear_all(pmc->sources); -- cgit From d2daa127ed51ac41217962d2b8f9c00be6e9c0d9 Mon Sep 17 00:00:00 2001 From: Andreas Oetken Date: Thu, 23 May 2019 13:57:14 +0200 Subject: hsr: fix don't prune the master node from the node_db Don't prune the master node in the hsr_prune_nodes function. Neither time_in[HSR_PT_SLAVE_A] nor time_in[HSR_PT_SLAVE_B] will ever be updated by hsr_register_frame_in for the master port. Thus, the master node will be repeatedly pruned leading to repeated packet loss. This bug never appeared because the hsr_prune_nodes function was only called once. Since commit 5150b45fd355 ("net: hsr: Fix node prune function for forget time expiry") this issue is fixed unveiling the issue described above. Fixes: 5150b45fd355 ("net: hsr: Fix node prune function for forget time expiry") Signed-off-by: Andreas Oetken Tested-by: Murali Karicheri Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 9fa9abd83018..2d7a19750436 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -365,6 +365,14 @@ void hsr_prune_nodes(struct timer_list *t) rcu_read_lock(); list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { + /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] + * nor time_in[HSR_PT_SLAVE_B], will ever be updated for + * the master port. Thus the master node will be repeatedly + * pruned leading to packet loss. + */ + if (hsr_addr_is_self(hsr, node->macaddress_A)) + continue; + /* Shorthand */ time_a = node->time_in[HSR_PT_SLAVE_A]; time_b = node->time_in[HSR_PT_SLAVE_B]; -- cgit From 4097e9d250fb17958c1d9b94538386edd3f20144 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 23 May 2019 09:32:31 +0300 Subject: net: sched: don't use tc_action->order during action dump Function tcf_action_dump() relies on tc_action->order field when starting nested nla to send action data to userspace. This approach breaks in several cases: - When multiple filters point to same shared action, tc_action->order field is overwritten each time it is attached to filter. This causes filter dump to output action with incorrect attribute for all filters that have the action in different position (different order) from the last set tc_action->order value. - When action data is displayed using tc action API (RTM_GETACTION), action order is overwritten by tca_action_gd() according to its position in resulting array of nl attributes, which will break filter dump for all filters attached to that shared action that expect it to have different order value. 
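In both cases the cure is to key the nested attribute on the action's position in the dump rather than on the shared, mutable field; a condensed sketch of the resulting tcf_action_dump() loop (error handling trimmed; the actual one-line change is in the diff below):

	for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
		a = actions[i];
		/* nest by position in this dump, not by a->order, which a
		 * shared action overwrites each time it is bound or listed
		 */
		nest = nla_nest_start_noflag(skb, i + 1);
		if (!nest)
			goto nla_put_failure;
		if (tcf_action_dump_1(skb, a, bind, ref) < 0)
			goto nla_put_failure;
		nla_nest_end(skb, nest);
	}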
Don't rely on tc_action->order when dumping actions. Set nla according to action position in resulting array of actions instead. Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 683fcc00da49..c42ecf4b3c10 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -800,7 +800,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { a = actions[i]; - nest = nla_nest_start_noflag(skb, a->order); + nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_1(skb, a, bind, ref); @@ -1303,7 +1303,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, ret = PTR_ERR(act); goto err; } - act->order = i; attr_size += tcf_action_fill_size(act); actions[i - 1] = act; } -- cgit From 95baa60a0da80a0143e3ddd4d3725758b4513825 Mon Sep 17 00:00:00 2001 From: Gen Zhang Date: Fri, 24 May 2019 11:19:46 +0800 Subject: ipv6_sockglue: Fix a missing-check bug in ip6_ra_control() In function ip6_ra_control(), the pointer new_ra is allocated a memory space via kmalloc(). And it is used in the following codes. However, when there is a memory allocation error, kmalloc() fails. Thus null pointer dereference may happen. And it will cause the kernel to crash. Therefore, we should check the return value and handle the error. Signed-off-by: Gen Zhang Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 40f21fef25ff..0a3d035feb61 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -68,6 +68,8 @@ int ip6_ra_control(struct sock *sk, int sel) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + if (sel >= 0 && !new_ra) + return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { -- cgit From 425aa0e1d01513437668fa3d4a971168bbaa8515 Mon Sep 17 00:00:00 2001 From: Gen Zhang Date: Fri, 24 May 2019 11:24:26 +0800 Subject: ip_sockglue: Fix missing-check bug in ip_ra_control() In function ip_ra_control(), the pointer new_ra is allocated a memory space via kmalloc(). And it is used in the following codes. However, when there is a memory allocation error, kmalloc() fails. Thus null pointer dereference may happen. And it will cause the kernel to crash. Therefore, we should check the return value and handle the error. Signed-off-by: Gen Zhang Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 82f341e84fae..aa3fd61818c4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -343,6 +343,8 @@ int ip_ra_control(struct sock *sk, unsigned char on, return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + if (on && !new_ra) + return -ENOMEM; mutex_lock(&net->ipv4.ra_mutex); for (rap = &net->ipv4.ra_chain; -- cgit From 46a1695960d0600d58da7af33c65f24f3d839674 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 May 2019 10:34:30 -0700 Subject: net/tls: fix lowat calculation if some data came from previous record If some of the data came from the previous record, i.e. 
from the rx_list it had already been decrypted, so it's not counted towards the "decrypted" variable, but the "copied" variable. Take that into account when checking lowat. When calculating lowat target we need to pass the original len. E.g. if lowat is at 80, len is 100 and we had 30 bytes on rx_list target would currently be incorrectly calculated as 70, even though we only need 50 more bytes to make up the 80. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Tested-by: David Beckett Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d93f83f77864..fc13234db74a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1712,13 +1712,12 @@ int tls_sw_recvmsg(struct sock *sk, copied = err; } - len = len - copied; - if (len) { - target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - } else { + if (len <= copied) goto recv_end; - } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + len = len - copied; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { bool retain_skb = false; @@ -1853,7 +1852,7 @@ pick_next_record: } /* If we have a new message from strparser, continue now. */ - if (decrypted >= target && !ctx->recv_pkt) + if (decrypted + copied >= target && !ctx->recv_pkt) break; } while (len); -- cgit From 04b25a5411f966c2e586909a8496553b71876fae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 May 2019 10:34:32 -0700 Subject: net/tls: fix no wakeup on partial reads When tls_sw_recvmsg() partially copies a record it pops that record from ctx->recv_pkt and places it on rx_list. Next iteration of tls_sw_recvmsg() reads from rx_list via process_rx_list() before it enters the decryption loop. If there is no more records to be read tls_wait_data() will put the process on the wait queue and got to sleep. This is incorrect, because some data was already copied in process_rx_list(). In case of RPC connections process may never get woken up, because peer also simply blocks in read(). I think this may also fix a similar issue when BPF is at play, because after __tcp_bpf_recvmsg() returns some data we subtract it from len and use continue to restart the loop, but len could have just reached 0, so again we'd sleep unnecessarily. That's added by: commit d3b18ad31f93 ("tls: add bpf support to sk_msg handling") Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Reported-by: David Beckett Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Tested-by: David Beckett Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fc13234db74a..960494f437ac 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1719,7 +1719,7 @@ int tls_sw_recvmsg(struct sock *sk, len = len - copied; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - do { + while (len && (decrypted + copied < target || ctx->recv_pkt)) { bool retain_skb = false; bool zc = false; int to_decrypt; @@ -1850,11 +1850,7 @@ pick_next_record: } else { break; } - - /* If we have a new message from strparser, continue now. 
*/ - if (decrypted + copied >= target && !ctx->recv_pkt) - break; - } while (len); + } recv_end: if (num_async) { -- cgit From 8fb44d60d4142cd2a440620cd291d346e23c131e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 May 2019 17:35:52 -0700 Subject: llc: fix skb leak in llc_build_and_send_ui_pkt() If llc_mac_hdr_init() returns an error, we must drop the skb since no llc_build_and_send_ui_pkt() caller will take care of this. BUG: memory leak unreferenced object 0xffff8881202b6800 (size 2048): comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.590s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 1a 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ backtrace: [<00000000e25b5abe>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<00000000e25b5abe>] slab_post_alloc_hook mm/slab.h:439 [inline] [<00000000e25b5abe>] slab_alloc mm/slab.c:3326 [inline] [<00000000e25b5abe>] __do_kmalloc mm/slab.c:3658 [inline] [<00000000e25b5abe>] __kmalloc+0x161/0x2c0 mm/slab.c:3669 [<00000000a1ae188a>] kmalloc include/linux/slab.h:552 [inline] [<00000000a1ae188a>] sk_prot_alloc+0xd6/0x170 net/core/sock.c:1608 [<00000000ded25bbe>] sk_alloc+0x35/0x2f0 net/core/sock.c:1662 [<000000002ecae075>] llc_sk_alloc+0x35/0x170 net/llc/llc_conn.c:950 [<00000000551f7c47>] llc_ui_create+0x7b/0x140 net/llc/af_llc.c:173 [<0000000029027f0e>] __sock_create+0x164/0x250 net/socket.c:1430 [<000000008bdec225>] sock_create net/socket.c:1481 [inline] [<000000008bdec225>] __sys_socket+0x69/0x110 net/socket.c:1523 [<00000000b6439228>] __do_sys_socket net/socket.c:1532 [inline] [<00000000b6439228>] __se_sys_socket net/socket.c:1530 [inline] [<00000000b6439228>] __x64_sys_socket+0x1e/0x30 net/socket.c:1530 [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 BUG: memory leak unreferenced object 0xffff88811d750d00 (size 224): comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.600s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 f0 0c 24 81 88 ff ff 00 68 2b 20 81 88 ff ff ...$.....h+ .... backtrace: [<0000000053026172>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<0000000053026172>] slab_post_alloc_hook mm/slab.h:439 [inline] [<0000000053026172>] slab_alloc_node mm/slab.c:3269 [inline] [<0000000053026172>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579 [<00000000fa8f3c30>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198 [<00000000d96fdafb>] alloc_skb include/linux/skbuff.h:1058 [inline] [<00000000d96fdafb>] alloc_skb_with_frags+0x5f/0x250 net/core/skbuff.c:5327 [<000000000a34a2e7>] sock_alloc_send_pskb+0x269/0x2a0 net/core/sock.c:2225 [<00000000ee39999b>] sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2242 [<00000000e034d810>] llc_ui_sendmsg+0x10a/0x540 net/llc/af_llc.c:933 [<00000000c0bc8445>] sock_sendmsg_nosec net/socket.c:652 [inline] [<00000000c0bc8445>] sock_sendmsg+0x54/0x70 net/socket.c:671 [<000000003b687167>] __sys_sendto+0x148/0x1f0 net/socket.c:1964 [<00000000922d78d9>] __do_sys_sendto net/socket.c:1976 [inline] [<00000000922d78d9>] __se_sys_sendto net/socket.c:1972 [inline] [<00000000922d78d9>] __x64_sys_sendto+0x2a/0x30 net/socket.c:1972 [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. 
Miller --- net/llc/llc_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index 94425e421213..9e4b6bcf6920 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -72,6 +72,8 @@ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); if (likely(!rc)) rc = dev_queue_xmit(skb); + else + kfree_skb(skb); return rc; } -- cgit From 458bf2f224f04a513b0be972f8708e78ee2c986e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 28 May 2019 11:47:31 -0700 Subject: net: core: support XDP generic on stacked devices. When a device is stacked like (team, bonding, failsafe or netvsc) the XDP generic program for the parent device was not called. Move the call to XDP generic inside __netif_receive_skb_core where it can be done multiple times for stacked case. Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 58 ++++++++++++---------------------------------------------- 1 file changed, 12 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b6b8505cfb3e..cc2a4e257324 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); - if (static_branch_unlikely(&generic_xdp_needed_key)) { - int ret; - - preempt_disable(); - rcu_read_lock(); - ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - rcu_read_unlock(); - preempt_enable(); - - /* Consider XDP consuming the packet a success from - * the netdev point of view we do not want to count - * this as an error. - */ - if (ret != XDP_PASS) - return NET_RX_SUCCESS; - } - #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4858,6 +4841,18 @@ another_round: __this_cpu_inc(softnet_data.processed); + if (static_branch_unlikely(&generic_xdp_needed_key)) { + int ret2; + + preempt_disable(); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + preempt_enable(); + + if (ret2 != XDP_PASS) + return NET_RX_DROP; + skb_reset_mac_len(skb); + } + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = skb_vlan_untag(skb); @@ -5178,19 +5173,6 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - if (static_branch_unlikely(&generic_xdp_needed_key)) { - int ret; - - preempt_disable(); - rcu_read_lock(); - ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - rcu_read_unlock(); - preempt_enable(); - - if (ret != XDP_PASS) - return NET_RX_DROP; - } - rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { @@ -5211,7 +5193,6 @@ static int netif_receive_skb_internal(struct sk_buff *skb) static void netif_receive_skb_list_internal(struct list_head *head) { - struct bpf_prog *xdp_prog = NULL; struct sk_buff *skb, *next; struct list_head sublist; @@ -5224,21 +5205,6 @@ static void netif_receive_skb_list_internal(struct list_head *head) } list_splice_init(&sublist, head); - if (static_branch_unlikely(&generic_xdp_needed_key)) { - preempt_disable(); - rcu_read_lock(); - list_for_each_entry_safe(skb, next, head, list) { - xdp_prog = rcu_dereference(skb->dev->xdp_prog); - skb_list_del_init(skb); - if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) - list_add_tail(&skb->list, 
&sublist); - } - rcu_read_unlock(); - preempt_enable(); - /* Put passed packets back on main list */ - list_splice_init(&sublist, head); - } - rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { -- cgit From 9609dad263f8bea347f41fddca29353dbf8a7d37 Mon Sep 17 00:00:00 2001 From: Young Xiao <92siuyang@gmail.com> Date: Wed, 29 May 2019 16:10:59 +0800 Subject: ipv4: tcp_input: fix stack out of bounds when parsing TCP options. The TCP option parsing routines in tcp_parse_options function could read one byte out of the buffer of the TCP options. 1 while (length > 0) { 2 int opcode = *ptr++; 3 int opsize; 4 5 switch (opcode) { 6 case TCPOPT_EOL: 7 return; 8 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 9 length--; 10 continue; 11 default: 12 opsize = *ptr++; //out of bound access If length = 1, then there is an access in line2. And another access is occurred in line 12. This would lead to out-of-bound access. Therefore, in the patch we check that the available data length is larger enough to pase both TCP option code and size. Signed-off-by: Young Xiao <92siuyang@gmail.com> Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c61edd023b35..08a477e74cf3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3791,6 +3791,8 @@ void tcp_parse_options(const struct net *net, length--; continue; default: + if (length < 2) + return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; -- cgit From d34d2baa9173f6e0c0f22d005d18e83d1cb54d8d Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 30 May 2019 00:42:30 +0300 Subject: net: dsa: tag_8021q: Change order of rx_vid setup The 802.1Q tagging performs an unbalanced setup in terms of RX VIDs on the CPU port. For the ingress path of a 802.1Q switch to work, the RX VID of a port needs to be seen as tagged egress on the CPU port. While configuring the other front-panel ports to be part of this VID, for bridge scenarios, the untagged flag is applied even on the CPU port in dsa_switch_vlan_add. This happens because DSA applies the same flags on the CPU port as on the (bridge-controlled) slave ports, and the effect in this case is that the CPU port tagged settings get deleted. Instead of fixing DSA by introducing a way to control VLAN flags on the CPU port (and hence stop inheriting from the slave ports) - a hard, perhaps intractable problem - avoid this situation by moving the setup part of the RX VID on the CPU port after all the other front-panel ports have been added to the VID. Fixes: f9bbe4477c30 ("net: dsa: Optional VLAN-based port separation for switches without tagging") Signed-off-by: Ioana Ciornei Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_8021q.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 8ae48c7e1e76..4adec6bbfe59 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -128,10 +128,7 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) u16 flags; if (i == upstream) - /* CPU port needs to see this port's RX VID - * as tagged egress. 
- */ - flags = 0; + continue; else if (i == port) /* The RX VID is pvid on this port */ flags = BRIDGE_VLAN_INFO_UNTAGGED | @@ -150,6 +147,20 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) return err; } } + + /* CPU port needs to see this port's RX VID + * as tagged egress. + */ + if (enabled) + err = dsa_port_vid_add(upstream_dp, rx_vid, 0); + else + err = dsa_port_vid_del(upstream_dp, rx_vid); + if (err) { + dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n", + rx_vid, port, err); + return err; + } + /* Finally apply the TX VID on this port and on the CPU port */ if (enabled) err = dsa_port_vid_add(dp, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED); -- cgit From 0471dd429cea6507a4000169ff6a33f41ba371b3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 30 May 2019 00:42:31 +0300 Subject: net: dsa: tag_8021q: Create a stable binary format Tools like tcpdump need to be able to decode the significance of fake VLAN headers that DSA uses to separate switch ports. But currently these have no global significance - they are simply an ordered list of DSA_MAX_SWITCHES x DSA_MAX_PORTS numbers ending at 4095. The reason why this is submitted as a fix is that the existing mapping of VIDs should not enter into a stable kernel, so we can pretend that only the new format exists. This way tcpdump won't need to try to make something out of the VLAN tags on 5.2 kernels. Fixes: f9bbe4477c30 ("net: dsa: Optional VLAN-based port separation for switches without tagging") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_8021q.c | 60 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 4adec6bbfe59..65a35e976d7b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -11,20 +11,59 @@ #include "dsa_priv.h" -/* Allocating two VLAN tags per port - one for the RX VID and - * the other for the TX VID - see below +/* Binary structure of the fake 12-bit VID field (when the TPID is + * ETH_P_DSA_8021Q): + * + * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | + * +-----------+-----+-----------------+-----------+-----------------------+ + * | DIR | RSV | SWITCH_ID | RSV | PORT | + * +-----------+-----+-----------------+-----------+-----------------------+ + * + * DIR - VID[11:10]: + * Direction flags. + * * 1 (0b01) for RX VLAN, + * * 2 (0b10) for TX VLAN. + * These values make the special VIDs of 0, 1 and 4095 to be left + * unused by this coding scheme. + * + * RSV - VID[9]: + * To be used for further expansion of SWITCH_ID or for other purposes. + * + * SWITCH_ID - VID[8:6]: + * Index of switch within DSA tree. Must be between 0 and + * DSA_MAX_SWITCHES - 1. + * + * RSV - VID[5:4]: + * To be used for further expansion of PORT or for other purposes. + * + * PORT - VID[3:0]: + * Index of switch port. Must be between 0 and DSA_MAX_PORTS - 1. 
*/ -#define DSA_8021Q_VID_RANGE (DSA_MAX_SWITCHES * DSA_MAX_PORTS) -#define DSA_8021Q_VID_BASE (VLAN_N_VID - 2 * DSA_8021Q_VID_RANGE - 1) -#define DSA_8021Q_RX_VID_BASE (DSA_8021Q_VID_BASE) -#define DSA_8021Q_TX_VID_BASE (DSA_8021Q_VID_BASE + DSA_8021Q_VID_RANGE) + +#define DSA_8021Q_DIR_SHIFT 10 +#define DSA_8021Q_DIR_MASK GENMASK(11, 10) +#define DSA_8021Q_DIR(x) (((x) << DSA_8021Q_DIR_SHIFT) & \ + DSA_8021Q_DIR_MASK) +#define DSA_8021Q_DIR_RX DSA_8021Q_DIR(1) +#define DSA_8021Q_DIR_TX DSA_8021Q_DIR(2) + +#define DSA_8021Q_SWITCH_ID_SHIFT 6 +#define DSA_8021Q_SWITCH_ID_MASK GENMASK(8, 6) +#define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ + DSA_8021Q_SWITCH_ID_MASK) + +#define DSA_8021Q_PORT_SHIFT 0 +#define DSA_8021Q_PORT_MASK GENMASK(3, 0) +#define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ + DSA_8021Q_PORT_MASK) /* Returns the VID to be inserted into the frame from xmit for switch steering * instructions on egress. Encodes switch ID and port ID. */ u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port) { - return DSA_8021Q_TX_VID_BASE + (DSA_MAX_PORTS * ds->index) + port; + return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port); } EXPORT_SYMBOL_GPL(dsa_8021q_tx_vid); @@ -33,21 +72,22 @@ EXPORT_SYMBOL_GPL(dsa_8021q_tx_vid); */ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) { - return DSA_8021Q_RX_VID_BASE + (DSA_MAX_PORTS * ds->index) + port; + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port); } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { - return ((vid - DSA_8021Q_RX_VID_BASE) / DSA_MAX_PORTS); + return (vid & DSA_8021Q_SWITCH_ID_MASK) >> DSA_8021Q_SWITCH_ID_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rx_switch_id); /* Returns the decoded port ID from the RX VID. */ int dsa_8021q_rx_source_port(u16 vid) { - return ((vid - DSA_8021Q_RX_VID_BASE) % DSA_MAX_PORTS); + return (vid & DSA_8021Q_PORT_MASK) >> DSA_8021Q_PORT_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); -- cgit From a4270d6795b0580287453ea55974d948393e66ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 May 2019 15:36:10 -0700 Subject: net-gro: fix use-after-free read in napi_gro_frags() If a network driver provides to napi_gro_frags() an skb with a page fragment of exactly 14 bytes, the call to gro_pull_from_frag0() will 'consume' the fragment by calling skb_frag_unref(skb, 0), and the page might be freed and reused. Reading eth->h_proto at the end of napi_frags_skb() might read mangled data, or crash under specific debugging features. 
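The ordering is easier to see outside the kernel. Below is a stand-alone user-space sketch of the pattern the fix enforces: take the header pointer into memory the skb itself owns (its linear area), let the pull copy the bytes there and release the fragment, and only then dereference. struct fake_skb, struct fake_ethhdr and pull_from_frag0() are invented stand-ins for the skb internals and gro_pull_from_frag0(), not kernel APIs; the KASAN report for the real bug follows.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Invented stand-ins for the kernel structures involved. */
struct fake_ethhdr {
	uint8_t  h_dest[6];
	uint8_t  h_source[6];
	uint16_t h_proto;
} __attribute__((packed));

struct fake_skb {
	unsigned char  linear[64];  /* like the skb head: owned, stays alive   */
	unsigned char *frag0;       /* like the page fragment: may be released */
	size_t         frag0_len;
};

/* Mimics gro_pull_from_frag0(): copy hlen bytes into the linear area and
 * release the fragment once it has been fully consumed - exactly what
 * happens when the fragment is only 14 bytes long.
 */
static void pull_from_frag0(struct fake_skb *skb, size_t hlen)
{
	memcpy(skb->linear, skb->frag0, hlen);
	skb->frag0_len -= hlen;
	if (skb->frag0_len == 0) {
		free(skb->frag0);   /* like skb_frag_unref() dropping the page */
		skb->frag0 = NULL;
	}
}

int main(void)
{
	struct fake_skb skb = { .frag0_len = sizeof(struct fake_ethhdr) };

	skb.frag0 = calloc(1, skb.frag0_len);   /* fragment of exactly 14 bytes */
	((struct fake_ethhdr *)skb.frag0)->h_proto = htons(0x0800);

	/* Buggy order: eth = (void *)skb.frag0, then pull_from_frag0(), then
	 * eth->h_proto - a use-after-free once the fragment is released.
	 * Fixed order: point at the linear area, copy first, read afterwards.
	 */
	const struct fake_ethhdr *eth = (const struct fake_ethhdr *)skb.linear;

	pull_from_frag0(&skb, sizeof(*eth));
	printf("h_proto: 0x%04x\n", ntohs(eth->h_proto));
	return 0;
}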
BUG: KASAN: use-after-free in napi_frags_skb net/core/dev.c:5833 [inline] BUG: KASAN: use-after-free in napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841 Read of size 2 at addr ffff88809366840c by task syz-executor599/8957 CPU: 1 PID: 8957 Comm: syz-executor599 Not tainted 5.2.0-rc1+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load_n_noabort+0xf/0x20 mm/kasan/generic_report.c:142 napi_frags_skb net/core/dev.c:5833 [inline] napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841 tun_get_user+0x2f3c/0x3ff0 drivers/net/tun.c:1991 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2037 call_write_iter include/linux/fs.h:1872 [inline] do_iter_readv_writev+0x5f8/0x8f0 fs/read_write.c:693 do_iter_write fs/read_write.c:970 [inline] do_iter_write+0x184/0x610 fs/read_write.c:951 vfs_writev+0x1b3/0x2f0 fs/read_write.c:1015 do_writev+0x15b/0x330 fs/read_write.c:1058 Fixes: a50e233c50db ("net-gro: restore frag0 optimization") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index cc2a4e257324..66f7508825bd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5775,7 +5775,6 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - eth = skb_gro_header_fast(skb, 0); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { @@ -5785,6 +5784,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) return NULL; } } else { + eth = (const struct ethhdr *)skb->data; gro_pull_from_frag0(skb, hlen); NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; -- cgit From 2b81f8161dfeda4017cef4f2498ccb64b13f0d61 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 May 2019 16:33:23 -0700 Subject: net: don't clear sock->sk early to avoid trouble in strparser af_inet sets sock->sk to NULL which trips strparser over: BUG: kernel NULL pointer dereference, address: 0000000000000012 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 7 PID: 0 Comm: swapper/7 Not tainted 5.2.0-rc1-00139-g14629453a6d3 #21 RIP: 0010:tcp_peek_len+0x10/0x60 RSP: 0018:ffffc02e41c54b98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9cf924c4e030 RCX: 0000000000000051 RDX: 0000000000000000 RSI: 000000000000000c RDI: ffff9cf97128f480 RBP: ffff9cf9365e0300 R08: ffff9cf94fe7d2c0 R09: 0000000000000000 R10: 000000000000036b R11: ffff9cf939735e00 R12: ffff9cf91ad9ae40 R13: ffff9cf924c4e000 R14: ffff9cf9a8fcbaae R15: 0000000000000020 FS: 0000000000000000(0000) GS:ffff9cf9af7c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000012 CR3: 000000013920a003 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: strp_data_ready+0x48/0x90 tls_data_ready+0x22/0xd0 [tls] tcp_rcv_established+0x569/0x620 tcp_v4_do_rcv+0x127/0x1e0 tcp_v4_rcv+0xad7/0xbf0 ip_protocol_deliver_rcu+0x2c/0x1c0 ip_local_deliver_finish+0x41/0x50 ip_local_deliver+0x6b/0xe0 ? ip_protocol_deliver_rcu+0x1c0/0x1c0 ip_rcv+0x52/0xd0 ? 
ip_rcv_finish_core.isra.20+0x380/0x380 __netif_receive_skb_one_core+0x7e/0x90 netif_receive_skb_internal+0x42/0xf0 napi_gro_receive+0xed/0x150 nfp_net_poll+0x7a2/0xd30 [nfp] ? kmem_cache_free_bulk+0x286/0x310 net_rx_action+0x149/0x3b0 __do_softirq+0xe3/0x30a ? handle_irq_event_percpu+0x6a/0x80 irq_exit+0xe8/0xf0 do_IRQ+0x85/0xd0 common_interrupt+0xf/0xf RIP: 0010:cpuidle_enter_state+0xbc/0x450 To avoid this issue, set sock->sk to NULL only after sk_prot->close. My grepping and testing did not discover any code which would depend on the current behaviour. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Reported-by: David Beckett Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5183a2daba64..aff93e7cdb31 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -428,8 +428,8 @@ int inet_release(struct socket *sock) if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) timeout = sk->sk_lingertime; - sock->sk = NULL; sk->sk_prot->close(sk, timeout); + sock->sk = NULL; } return 0; } -- cgit From b73484b2fc0d0ba84a13e9d86eb4adcae718163b Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 30 May 2019 16:08:40 +0200 Subject: ethtool: Check for vlan etype or vlan tci when parsing flow_rule When parsing an ethtool flow spec to build a flow_rule, the code checks if both the vlan etype and the vlan tci are specified by the user to add a FLOW_DISSECTOR_KEY_VLAN match. However, when the user only specifies a vlan etype or a vlan tci, this check silently ignores these parameters. For example, the following rule: ethtool -N eth0 flow-type udp4 vlan 0x0010 action -1 loc 0 will result in no error being issued, but the equivalent rule will be created and passed to the NIC driver: ethtool -N eth0 flow-type udp4 action -1 loc 0 In the end, neither the NIC driver using the rule nor the end user has a way to know that these keys were dropped along the way, or that incorrect parameters were entered. This kind of check should be left to either the driver or the ethtool flow spec layer. This commit makes it so that ethtool parameters are forwarded as-is to the NIC driver. Since none of the users of ethtool_rx_flow_rule_create are using the VLAN dissector, I don't think this qualifies as a regression. Fixes: eca4205f9ec3 ("ethtool: add ethtool_rx_flow_spec to flow_rule structure translator") Signed-off-by: Maxime Chevallier Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4a593853cbf2..43e9add58340 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -3010,11 +3010,12 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; - if (ext_m_spec->vlan_etype && - ext_m_spec->vlan_tci) { + if (ext_m_spec->vlan_etype) { match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; + } + if (ext_m_spec->vlan_tci) { match->key.vlan.vlan_id = ntohs(ext_h_spec->vlan_tci) & 0x0fff; match->mask.vlan.vlan_id = @@ -3024,7 +3025,10 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; match->mask.vlan.vlan_priority = (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; + } + if (ext_m_spec->vlan_etype || + ext_m_spec->vlan_tci) { match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = -- cgit
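For context, here is a rough user-space sketch of what the quoted ethtool command hands to the kernel through the classic SIOCETHTOOL/ETHTOOL_SRXCLSRLINS interface. The interface name "eth0", the all-ones TCI mask and the missing error handling are assumptions for illustration only. The point of interest is that only m_ext.vlan_tci is non-zero while m_ext.vlan_etype stays zero - exactly the case that the old '&&' check silently dropped and that the patched code now forwards as a FLOW_DISSECTOR_KEY_VLAN match.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_rxnfc nfc = { .cmd = ETHTOOL_SRXCLSRLINS };
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	/* Roughly: ethtool -N eth0 flow-type udp4 vlan 0x0010 action -1 loc 0 */
	nfc.fs.flow_type = UDP_V4_FLOW | FLOW_EXT;  /* FLOW_EXT enables h_ext/m_ext  */
	nfc.fs.h_ext.vlan_tci = htons(0x0010);      /* VLAN TCI value to match       */
	nfc.fs.m_ext.vlan_tci = htons(0xffff);      /* TCI mask set, vlan_etype not  */
	nfc.fs.ring_cookie = RX_CLS_FLOW_DISC;      /* action -1: discard            */
	nfc.fs.location = 0;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);  /* assumed interface name */
	ifr.ifr_data = (void *)&nfc;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		perror("ETHTOOL_SRXCLSRLINS");
	return 0;
}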
From 100f6d8e09905c59be45b6316f8f369c0be1b2d8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 30 May 2019 18:01:21 -0400 Subject: net: correct zerocopy refcnt with udp MSG_MORE TCP zerocopy takes a uarg reference for every skb, plus one for the tcp_sendmsg_locked datapath temporarily, to avoid reaching refcnt zero as it builds, sends and frees skbs inside its inner loop. UDP and RAW zerocopy do not send inside the inner loop so do not need the extra sock_zerocopy_get + sock_zerocopy_put pair. Commit 52900d22288ed ("udp: elide zerocopy operation in hot path") introduced extra_uref to pass the initial reference taken in sock_zerocopy_alloc to the first generated skb. But sock_zerocopy_realloc takes this extra reference at the start of every call. With MSG_MORE, no new skb may be generated to attach the extra_uref to, so refcnt is incorrectly 2 with only one skb. Do not take the extra ref if uarg && !tcp, which implies MSG_MORE. Update extra_uref accordingly. This conditional assignment triggers a false-positive "may be used uninitialized" warning, so extra_uref has to be initialized at its definition. Changes v1->v2: fix typo in Fixes SHA1 Fixes: 52900d22288e7 ("udp: elide zerocopy operation in hot path") Reported-by: syzbot Diagnosed-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 6 +++++- net/ipv4/ip_output.c | 4 ++-- net/ipv6/ip6_output.c | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e89be6282693..eaad23f9c7b5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1036,7 +1036,11 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, uarg->len++; uarg->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); - sock_zerocopy_get(uarg); + + /* no extra ref when appending to datagram (MSG_MORE) */ + if (sk->sk_type == SOCK_STREAM) + sock_zerocopy_get(uarg); + return uarg; } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bfd0ca554977..8c9189a41b13 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -878,7 +878,7 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref; + bool paged, extra_uref = false; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -918,7 +918,7 @@ static int __ip_append_data(struct sock *sk, uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; - extra_uref = true; + extra_uref = !skb; /* only extra ref if !MSG_MORE */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index adef2236abe2..f9e43323e667 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1275,7 +1275,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref; + bool paged, extra_uref = false; skb = skb_peek_tail(queue); if (!skb) { @@ -1344,7 +1344,7 @@ emsgsize: uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; - extra_uref = true; + extra_uref = !skb; /* only extra ref if !MSG_MORE */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; -- cgit
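Finally, the MSG_MORE path that the last patch corrects is driven from user space roughly as below: several sends with MSG_ZEROCOPY build up a single UDP datagram, and only the send without MSG_MORE releases it. The destination address, buffer sizes and the omitted completion handling (normally read from the socket error queue with MSG_ERRQUEUE) are illustrative assumptions, not part of the patch.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY 60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0x4000000
#endif

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(9000),           /* placeholder destination */
	};
	static char part1[4096], part2[4096];
	int one = 1;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
	setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
	connect(fd, (struct sockaddr *)&dst, sizeof(dst));

	/* MSG_MORE keeps the datagram open; the kernel tracks the pinned pages
	 * with a single uarg for the whole datagram.
	 */
	if (send(fd, part1, sizeof(part1), MSG_ZEROCOPY | MSG_MORE) < 0)
		perror("send part1");

	/* The final send (no MSG_MORE) flushes the datagram. With the refcount
	 * off by one, the zerocopy completion on the error queue could be left
	 * pending; with the fix it is delivered once the data has been sent.
	 */
	if (send(fd, part2, sizeof(part2), MSG_ZEROCOPY) < 0)
		perror("send part2");
	return 0;
}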