aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c17
-rw-r--r--net/ipv4/arp.c25
-rw-r--r--net/ipv4/bpf_tcp_ca.c39
-rw-r--r--net/ipv4/cipso_ipv4.c12
-rw-r--r--net/ipv4/devinet.c4
-rw-r--r--net/ipv4/esp4.c4
-rw-r--r--net/ipv4/fib_semantics.c15
-rw-r--r--net/ipv4/fib_trie.c2
-rw-r--r--net/ipv4/icmp.c20
-rw-r--r--net/ipv4/inet_timewait_sock.c3
-rw-r--r--net/ipv4/inetpeer.c12
-rw-r--r--net/ipv4/ip_output.c10
-rw-r--r--net/ipv4/ip_tunnel.c21
-rw-r--r--net/ipv4/ipconfig.c8
-rw-r--r--net/ipv4/ipmr.c217
-rw-r--r--net/ipv4/ipmr_base.c53
-rw-r--r--net/ipv4/nexthop.c5
-rw-r--r--net/ipv4/ping.c36
-rw-r--r--net/ipv4/raw.c172
-rw-r--r--net/ipv4/raw_diag.c57
-rw-r--r--net/ipv4/route.c41
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c12
-rw-r--r--net/ipv4/tcp.c116
-rw-r--r--net/ipv4/tcp_bpf.c1
-rw-r--r--net/ipv4/tcp_input.c11
-rw-r--r--net/ipv4/tcp_ipv4.c5
-rw-r--r--net/ipv4/tcp_output.c36
-rw-r--r--net/ipv4/tcp_timer.c19
-rw-r--r--net/ipv4/udp.c25
-rw-r--r--net/ipv4/udplite.c3
-rw-r--r--net/ipv4/xfrm4_policy.c2
32 files changed, 542 insertions, 463 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 93da9f783bec..fac552e98eb9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -148,10 +148,10 @@ void inet_sock_destruct(struct sock *sk)
return;
}
- WARN_ON(atomic_read(&sk->sk_rmem_alloc));
- WARN_ON(refcount_read(&sk->sk_wmem_alloc));
- WARN_ON(sk->sk_wmem_queued);
- WARN_ON(sk_forward_alloc_get(sk));
+ WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
+ WARN_ON_ONCE(sk->sk_wmem_queued);
+ WARN_ON_ONCE(sk_forward_alloc_get(sk));
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
@@ -1040,6 +1040,7 @@ const struct proto_ops inet_stream_ops = {
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
+ .read_skb = tcp_read_skb,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
.peek_len = tcp_peek_len,
@@ -1067,7 +1068,7 @@ const struct proto_ops inet_dgram_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .read_sock = udp_read_sock,
+ .read_skb = udp_read_skb,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
@@ -1246,7 +1247,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (new_saddr == old_saddr)
return 0;
- if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}
@@ -1301,7 +1302,7 @@ int inet_sk_rebuild_header(struct sock *sk)
* Other protocols have to map its equivalent state to TCP_SYN_SENT.
* DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
*/
- if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
(err = inet_sk_reselect_saddr(sk)) != 0)
@@ -1929,6 +1930,8 @@ static int __init inet_init(void)
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
+ raw_hashinfo_init(&raw_v4_hashinfo);
+
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index ab4a5601c82a..87c7e3fc5197 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -168,6 +168,7 @@ struct neigh_table arp_tbl = {
[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 64,
@@ -428,6 +429,26 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
return !inet_confirm_addr(net, in_dev, sip, tip, scope);
}
+static int arp_accept(struct in_device *in_dev, __be32 sip)
+{
+ struct net *net = dev_net(in_dev->dev);
+ int scope = RT_SCOPE_LINK;
+
+ switch (IN_DEV_ARP_ACCEPT(in_dev)) {
+ case 0: /* Don't create new entries from garp */
+ return 0;
+ case 1: /* Create new entries from garp */
+ return 1;
+ case 2: /* Create a neighbor in the arp table only if sip
+ * is in the same subnet as an address configured
+ * on the interface that received the garp message
+ */
+ return !!inet_confirm_addr(net, in_dev, sip, 0, scope);
+ default:
+ return 0;
+ }
+}
+
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
struct rtable *rt;
@@ -867,12 +888,12 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
addr_type = -1;
- if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
+ if (n || arp_accept(in_dev, sip)) {
is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
sip, tip, sha, tha);
}
- if (IN_DEV_ARP_ACCEPT(in_dev)) {
+ if (arp_accept(in_dev, sip)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index f79ab942f03b..7a181631b995 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -14,18 +14,6 @@
/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */
extern struct bpf_struct_ops bpf_tcp_congestion_ops;
-static u32 optional_ops[] = {
- offsetof(struct tcp_congestion_ops, init),
- offsetof(struct tcp_congestion_ops, release),
- offsetof(struct tcp_congestion_ops, set_state),
- offsetof(struct tcp_congestion_ops, cwnd_event),
- offsetof(struct tcp_congestion_ops, in_ack_event),
- offsetof(struct tcp_congestion_ops, pkts_acked),
- offsetof(struct tcp_congestion_ops, min_tso_segs),
- offsetof(struct tcp_congestion_ops, sndbuf_expand),
- offsetof(struct tcp_congestion_ops, cong_control),
-};
-
static u32 unsupported_ops[] = {
offsetof(struct tcp_congestion_ops, get_info),
};
@@ -51,18 +39,6 @@ static int bpf_tcp_ca_init(struct btf *btf)
return 0;
}
-static bool is_optional(u32 member_offset)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(optional_ops); i++) {
- if (member_offset == optional_ops[i])
- return true;
- }
-
- return false;
-}
-
static bool is_unsupported(u32 member_offset)
{
unsigned int i;
@@ -111,6 +87,12 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
}
switch (off) {
+ case offsetof(struct sock, sk_pacing_rate):
+ end = offsetofend(struct sock, sk_pacing_rate);
+ break;
+ case offsetof(struct sock, sk_pacing_status):
+ end = offsetofend(struct sock, sk_pacing_status);
+ break;
case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
break;
@@ -240,7 +222,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
{
const struct tcp_congestion_ops *utcp_ca;
struct tcp_congestion_ops *tcp_ca;
- int prog_fd;
u32 moff;
utcp_ca = (const struct tcp_congestion_ops *)udata;
@@ -262,14 +243,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
return 1;
}
- if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL))
- return 0;
-
- /* Ensure bpf_prog is provided for compulsory func ptr */
- prog_fd = (int)(*(unsigned long *)(udata + moff));
- if (!prog_fd && !is_optional(moff) && !is_unsupported(moff))
- return -EINVAL;
-
return 0;
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 62d5f99760aa..6cd3b6c559f0 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -239,7 +239,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
struct cipso_v4_map_cache_entry *prev_entry = NULL;
u32 hash;
- if (!cipso_v4_cache_enabled)
+ if (!READ_ONCE(cipso_v4_cache_enabled))
return -ENOENT;
hash = cipso_v4_map_cache_hash(key, key_len);
@@ -296,13 +296,14 @@ static int cipso_v4_cache_check(const unsigned char *key,
int cipso_v4_cache_add(const unsigned char *cipso_ptr,
const struct netlbl_lsm_secattr *secattr)
{
+ int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize);
int ret_val = -EPERM;
u32 bkt;
struct cipso_v4_map_cache_entry *entry = NULL;
struct cipso_v4_map_cache_entry *old_entry = NULL;
u32 cipso_ptr_len;
- if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+ if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0)
return 0;
cipso_ptr_len = cipso_ptr[1];
@@ -322,7 +323,7 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr,
bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
spin_lock_bh(&cipso_v4_cache[bkt].lock);
- if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+ if (cipso_v4_cache[bkt].size < bkt_size) {
list_add(&entry->list, &cipso_v4_cache[bkt].list);
cipso_v4_cache[bkt].size += 1;
} else {
@@ -1199,7 +1200,8 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
/* This will send packets using the "optimized" format when
* possible as specified in section 3.4.2.6 of the
* CIPSO draft. */
- if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
+ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 &&
+ ret_val <= 10)
tag_len = 14;
else
tag_len = 4 + ret_val;
@@ -1603,7 +1605,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
* all the CIPSO validations here but it doesn't
* really specify _exactly_ what we need to validate
* ... so, just make it a sysctl tunable. */
- if (cipso_v4_rbm_strictvalid) {
+ if (READ_ONCE(cipso_v4_rbm_strictvalid)) {
if (cipso_v4_map_lvl_valid(doi_def,
tag[3]) < 0) {
err_offset = opt_iter + 3;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b2366ad540e6..92b778e423df 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -244,7 +244,7 @@ void in_dev_finish_destroy(struct in_device *idev)
#ifdef NET_REFCNT_DEBUG
pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
- dev_put_track(dev, &idev->dev_tracker);
+ netdev_put(dev, &idev->dev_tracker);
if (!idev->dead)
pr_err("Freeing alive in_device %p\n", idev);
else
@@ -272,7 +272,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
dev_disable_lro(dev);
/* Reference in_dev->dev */
- dev_hold_track(dev, &in_dev->dev_tracker, GFP_KERNEL);
+ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL);
/* Account for reference dev->ip_ptr (below) */
refcount_set(&in_dev->refcnt, 1);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b21238df3301..7eae8d686e20 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -502,9 +502,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
nfrags++;
- skb->len += tailen;
- skb->data_len += tailen;
- skb->truesize += tailen;
+ skb_len_add(skb, tailen);
if (sk && sk_fullsock(sk))
refcount_add(tailen, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a57ba23571c9..14f037e8ac55 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -211,7 +211,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
void fib_nh_common_release(struct fib_nh_common *nhc)
{
- dev_put_track(nhc->nhc_dev, &nhc->nhc_dev_tracker);
+ netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
lwtstate_put(nhc->nhc_lwtstate);
rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
rt_fibinfo_free(&nhc->nhc_rth_input);
@@ -1057,7 +1057,8 @@ static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
if (!err) {
nh->fib_nh_dev = fib6_nh.fib_nh_dev;
- dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL);
+ netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
+ GFP_KERNEL);
nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
nh->fib_nh_scope = RT_SCOPE_LINK;
@@ -1141,7 +1142,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
if (!netif_carrier_ok(dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
nh->fib_nh_dev = dev;
- dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
nh->fib_nh_scope = RT_SCOPE_LINK;
return 0;
}
@@ -1195,7 +1196,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
"No egress device for nexthop gateway");
goto out;
}
- dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
if (!netif_carrier_ok(dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
@@ -1229,8 +1230,8 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
}
nh->fib_nh_dev = in_dev->dev;
- dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
- nh->fib_nh_scope = RT_SCOPE_HOST;
+ netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ nh->fib_nh_scope = RT_SCOPE_LINK;
if (!netif_carrier_ok(nh->fib_nh_dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
err = 0;
@@ -1811,7 +1812,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
if (nexthop_is_blackhole(fi->nh))
rtm->rtm_type = RTN_BLACKHOLE;
- if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode)
+ if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode))
goto offload;
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2734c3af7e24..46e8a5125853 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -498,7 +498,7 @@ static void tnode_free(struct key_vector *tn)
tn = container_of(head, struct tnode, rcu)->kv;
}
- if (tnode_free_size >= sysctl_fib_sync_mem) {
+ if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
tnode_free_size = 0;
synchronize_rcu();
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index efea0e796f06..57c4f0d87a7a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -253,11 +253,12 @@ bool icmp_global_allow(void)
spin_lock(&icmp_global.lock);
delta = min_t(u32, now - icmp_global.stamp, HZ);
if (delta >= HZ / 50) {
- incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
+ incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
if (incr)
WRITE_ONCE(icmp_global.stamp, now);
}
- credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
+ credit = min_t(u32, icmp_global.credit + incr,
+ READ_ONCE(sysctl_icmp_msgs_burst));
if (credit) {
/* We want to use a credit of one in average, but need to randomize
* it for security reasons.
@@ -281,7 +282,7 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code)
return true;
/* Limit if icmp type is enabled in ratemask. */
- if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ if (!((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask)))
return true;
return false;
@@ -319,7 +320,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
vif = l3mdev_master_ifindex(dst->dev);
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
- rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+ rc = inet_peer_xrlim_allow(peer,
+ READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
if (peer)
inet_putpeer(peer);
out:
@@ -692,7 +694,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_lock();
if (rt_is_input_route(rt) &&
- net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
+ READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
if (dev)
@@ -932,7 +934,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
* get the other vendor to fix their kit.
*/
- if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_ignore_bogus_error_responses) &&
inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
&ip_hdr(skb)->saddr,
@@ -992,7 +994,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
net = dev_net(skb_dst(skb)->dev);
/* should there be an ICMP stat for ignored echos? */
- if (net->ipv4.sysctl_icmp_echo_ignore_all)
+ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
return SKB_NOT_DROPPED_YET;
icmp_param.data.icmph = *icmp_hdr(skb);
@@ -1027,7 +1029,7 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
u16 ident_len;
u8 status;
- if (!net->ipv4.sysctl_icmp_echo_enable_probe)
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
return false;
/* We currently only support probing interfaces on the proxy node
@@ -1248,7 +1250,7 @@ int icmp_rcv(struct sk_buff *skb)
*/
if ((icmph->type == ICMP_ECHO ||
icmph->type == ICMP_TIMESTAMP) &&
- net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
+ READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_broadcasts)) {
reason = SKB_DROP_REASON_INVALID_PROTO;
goto error;
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 0ec501845cb3..47ccc343c9fb 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -156,7 +156,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
{
struct inet_timewait_sock *tw;
- if (refcount_read(&dr->tw_refcount) - 1 >= dr->sysctl_max_tw_buckets)
+ if (refcount_read(&dr->tw_refcount) - 1 >=
+ READ_ONCE(dr->sysctl_max_tw_buckets))
return NULL;
tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index da21dfce24d7..e9fed83e9b3c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -141,16 +141,20 @@ static void inet_peer_gc(struct inet_peer_base *base,
struct inet_peer *gc_stack[],
unsigned int gc_cnt)
{
+ int peer_threshold, peer_maxttl, peer_minttl;
struct inet_peer *p;
__u32 delta, ttl;
int i;
- if (base->total >= inet_peer_threshold)
+ peer_threshold = READ_ONCE(inet_peer_threshold);
+ peer_maxttl = READ_ONCE(inet_peer_maxttl);
+ peer_minttl = READ_ONCE(inet_peer_minttl);
+
+ if (base->total >= peer_threshold)
ttl = 0; /* be aggressive */
else
- ttl = inet_peer_maxttl
- - (inet_peer_maxttl - inet_peer_minttl) / HZ *
- base->total / inet_peer_threshold * HZ;
+ ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ *
+ base->total / peer_threshold * HZ;
for (i = 0; i < gc_cnt; i++) {
p = gc_stack[i];
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index df7f9dfbe8be..d7bd1daf022b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1236,9 +1236,7 @@ alloc_new_skb:
pfrag->offset += copy;
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
+ skb_len_add(skb, copy);
wmem_alloc_delta += copy;
} else {
err = skb_zerocopy_iter_dgram(skb, from, copy);
@@ -1465,9 +1463,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
skb->csum = csum_block_add(skb->csum, csum, skb->len);
}
- skb->len += len;
- skb->data_len += len;
- skb->truesize += len;
+ skb_len_add(skb, len);
refcount_add(len, &sk->sk_wmem_alloc);
offset += len;
size -= len;
@@ -1726,7 +1722,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
- rt = ip_route_output_key(net, &fl4);
+ rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt))
return;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 94017a8c3994..e65e948cab9f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -242,7 +242,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
goto failed;
- strlcpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name, IFNAMSIZ);
} else {
if (strlen(ops->kind) > (IFNAMSIZ - 3))
goto failed;
@@ -641,6 +641,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *inner_iph;
unsigned int max_headroom; /* The extra header space needed */
struct rtable *rt = NULL; /* Route to the other host */
+ __be16 payload_protocol;
bool use_cache = false;
struct flowi4 fl4;
bool md = false;
@@ -651,6 +652,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
+ payload_protocol = skb_protocol(skb, true);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -670,13 +672,12 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
dst = tun_info->key.u.ipv4.dst;
md = true;
connected = true;
- }
- else if (skb->protocol == htons(ETH_P_IP)) {
+ } else if (payload_protocol == htons(ETH_P_IP)) {
rt = skb_rtable(skb);
dst = rt_nexthop(rt, inner_iph->daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
+ else if (payload_protocol == htons(ETH_P_IPV6)) {
const struct in6_addr *addr6;
struct neighbour *neigh;
bool do_tx_error_icmp;
@@ -716,10 +717,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tos = tnl_params->tos;
if (tos & 0x1) {
tos &= ~0x1;
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (payload_protocol == htons(ETH_P_IP)) {
tos = inner_iph->tos;
connected = false;
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ } else if (payload_protocol == htons(ETH_P_IPV6)) {
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
connected = false;
}
@@ -765,7 +766,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
df = tnl_params->frag_off;
- if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
+ if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
df |= (inner_iph->frag_off & htons(IP_DF));
if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
@@ -786,10 +787,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = tnl_params->ttl;
if (ttl == 0) {
- if (skb->protocol == htons(ETH_P_IP))
+ if (payload_protocol == htons(ETH_P_IP))
ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6))
+ else if (payload_protocol == htons(ETH_P_IPV6))
ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
else
@@ -1065,7 +1066,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
memset(&parms, 0, sizeof(parms));
if (devname)
- strlcpy(parms.name, devname, IFNAMSIZ);
+ strscpy(parms.name, devname, IFNAMSIZ);
rtnl_lock();
itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 9d41d5d5cd1e..f53a0f2453af 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1759,15 +1759,15 @@ static int __init ip_auto_config_setup(char *addrs)
case 4:
if ((dp = strchr(ip, '.'))) {
*dp++ = '\0';
- strlcpy(utsname()->domainname, dp,
+ strscpy(utsname()->domainname, dp,
sizeof(utsname()->domainname));
}
- strlcpy(utsname()->nodename, ip,
+ strscpy(utsname()->nodename, ip,
sizeof(utsname()->nodename));
ic_host_name_set = 1;
break;
case 5:
- strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+ strscpy(user_dev_name, ip, sizeof(user_dev_name));
break;
case 6:
if (ic_proto_name(ip) == 0 &&
@@ -1814,7 +1814,7 @@ __setup("nfsaddrs=", nfsaddrs_config_setup);
static int __init vendor_class_identifier_setup(char *addrs)
{
- if (strlcpy(vendor_class_identifier, addrs,
+ if (strscpy(vendor_class_identifier, addrs,
sizeof(vendor_class_identifier))
>= sizeof(vendor_class_identifier))
pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n",
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 13e6329784fb..73651d17e51f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -77,7 +77,12 @@ struct ipmr_result {
* Note that the changes are semaphored via rtnl_lock.
*/
-static DEFINE_RWLOCK(mrt_lock);
+static DEFINE_SPINLOCK(mrt_lock);
+
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
/* Multicast router control variables */
@@ -100,11 +105,11 @@ static void ipmr_free_table(struct mr_table *mrt);
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc_cache *cache, int local);
-static int ipmr_cache_report(struct mr_table *mrt,
+static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
-static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);
@@ -501,11 +506,15 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
return err;
}
- read_lock(&mrt_lock);
dev->stats.tx_bytes += skb->len;
dev->stats.tx_packets++;
- ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
- read_unlock(&mrt_lock);
+ rcu_read_lock();
+
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ IGMPMSG_WHOLEPKT);
+
+ rcu_read_unlock();
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -572,6 +581,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
{
struct net_device *reg_dev = NULL;
struct iphdr *encap;
+ int vif_num;
encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
/* Check that:
@@ -584,11 +594,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
ntohs(encap->tot_len) + pimlen > skb->len)
return 1;
- read_lock(&mrt_lock);
- if (mrt->mroute_reg_vif_num >= 0)
- reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- read_unlock(&mrt_lock);
-
+ /* Pairs with WRITE_ONCE() in vif_add()/vid_delete() */
+ vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+ if (vif_num >= 0)
+ reg_dev = vif_dev_read(&mrt->vif_table[vif_num]);
if (!reg_dev)
return 1;
@@ -614,10 +623,11 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
static int call_ipmr_vif_entry_notifiers(struct net *net,
enum fib_event_type event_type,
struct vif_device *vif,
+ struct net_device *vif_dev,
vifi_t vif_index, u32 tb_id)
{
return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
- vif, vif_index, tb_id,
+ vif, vif_dev, vif_index, tb_id,
&net->ipv4.ipmr_seq);
}
@@ -649,22 +659,19 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
- if (VIF_EXISTS(mrt, vifi))
- call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
- mrt->id);
-
- write_lock_bh(&mrt_lock);
- dev = v->dev;
- v->dev = NULL;
-
- if (!dev) {
- write_unlock_bh(&mrt_lock);
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
return -EADDRNOTAVAIL;
- }
- if (vifi == mrt->mroute_reg_vif_num)
- mrt->mroute_reg_vif_num = -1;
+ spin_lock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ RCU_INIT_POINTER(v->dev, NULL);
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
if (vifi + 1 == mrt->maxvif) {
int tmp;
@@ -672,10 +679,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
if (VIF_EXISTS(mrt, tmp))
break;
}
- mrt->maxvif = tmp+1;
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
dev_set_allmulti(dev, -1);
@@ -691,7 +698,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
- dev_put_track(dev, &v->dev_tracker);
+ netdev_put(dev, &v->dev_tracker);
return 0;
}
@@ -777,7 +784,7 @@ out:
spin_unlock(&mfc_unres_lock);
}
-/* Fill oifs list. It is called under write locked mrt_lock. */
+/* Fill oifs list. It is called under locked mrt_lock. */
static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
unsigned char *ttls)
{
@@ -889,15 +896,18 @@ static int vif_add(struct net *net, struct mr_table *mrt,
v->remote = vifc->vifc_rmt_addr.s_addr;
/* And finish update writing critical data */
- write_lock_bh(&mrt_lock);
- v->dev = dev;
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
- if (v->flags & VIFF_REGISTER)
- mrt->mroute_reg_vif_num = vifi;
+ if (v->flags & VIFF_REGISTER) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
+ }
if (vifi+1 > mrt->maxvif)
- mrt->maxvif = vifi+1;
- write_unlock_bh(&mrt_lock);
- call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
+ vifi, mrt->id);
return 0;
}
@@ -1001,9 +1011,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
/* Bounce a cache query up to mrouted and netlink.
*
- * Called under mrt_lock.
+ * Called under rcu_read_lock().
*/
-static int ipmr_cache_report(struct mr_table *mrt,
+static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert)
{
const int ihl = ip_hdrlen(pkt);
@@ -1038,8 +1048,11 @@ static int ipmr_cache_report(struct mr_table *mrt,
msg->im_vif = vifi;
msg->im_vif_hi = vifi >> 8;
} else {
- msg->im_vif = mrt->mroute_reg_vif_num;
- msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8;
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ int vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+
+ msg->im_vif = vif_num;
+ msg->im_vif_hi = vif_num >> 8;
}
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
@@ -1064,10 +1077,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
skb->transport_header = skb->network_header;
}
- rcu_read_lock();
mroute_sk = rcu_dereference(mrt->mroute_sk);
if (!mroute_sk) {
- rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
@@ -1076,7 +1087,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
/* Deliver to mrouted */
ret = sock_queue_rcv_skb(mroute_sk, skb);
- rcu_read_unlock();
+
if (ret < 0) {
net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
kfree_skb(skb);
@@ -1086,6 +1097,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
}
/* Queue a packet for resolution. It gets locked cache entry! */
+/* Called under rcu_read_lock() */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
struct sk_buff *skb, struct net_device *dev)
{
@@ -1198,12 +1210,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
mfc->mfcc_mcastgrp.s_addr, parent);
rcu_read_unlock();
if (c) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
c->_c.mfc_parent = mfc->mfcc_parent;
ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
if (!mrtsock)
c->_c.mfc_flags |= MFC_STATIC;
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
mrt->id);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
@@ -1598,20 +1610,20 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1673,20 +1685,20 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1726,7 +1738,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
ipmr_for_each_table(mrt, net) {
v = &mrt->vif_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
- if (v->dev == dev)
+ if (rcu_access_pointer(v->dev) == dev)
vif_delete(mrt, ct, 1, NULL);
}
}
@@ -1804,26 +1816,28 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
}
#endif
-/* Processing handlers for ipmr_forward */
+/* Processing handlers for ipmr_forward, under rcu_read_lock() */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
int in_vifi, struct sk_buff *skb, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
+ struct net_device *vif_dev;
struct net_device *dev;
struct rtable *rt;
struct flowi4 fl4;
int encap = 0;
- if (!vif->dev)
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
goto out_free;
if (vif->flags & VIFF_REGISTER) {
- vif->pkt_out++;
- vif->bytes_out += skb->len;
- vif->dev->stats.tx_bytes += skb->len;
- vif->dev->stats.tx_packets++;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ vif_dev->stats.tx_bytes += skb->len;
+ vif_dev->stats.tx_packets++;
ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
goto out_free;
}
@@ -1868,8 +1882,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
goto out_free;
}
- vif->pkt_out++;
- vif->bytes_out += skb->len;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
@@ -1881,8 +1895,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (vif->flags & VIFF_TUNNEL) {
ip_encap(net, skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
- vif->dev->stats.tx_packets++;
- vif->dev->stats.tx_bytes += skb->len;
+ vif_dev->stats.tx_packets++;
+ vif_dev->stats.tx_bytes += skb->len;
}
IPCB(skb)->flags |= IPSKB_FORWARDED;
@@ -1906,18 +1920,20 @@ out_free:
kfree_skb(skb);
}
-static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
+/* Called with mrt_lock or rcu_read_lock() */
+static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
{
int ct;
-
- for (ct = mrt->maxvif-1; ct >= 0; ct--) {
- if (mrt->vif_table[ct].dev == dev)
+ /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
break;
}
return ct;
}
/* "local" means that we should preserve one skb (for local delivery) */
+/* Called uner rcu_read_lock() */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc_cache *c, int local)
@@ -1944,7 +1960,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
/* Wrong interface: drop packet and (maybe) send PIM assert. */
- if (mrt->vif_table[vif].dev != dev) {
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -1983,8 +1999,10 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
forward:
- mrt->vif_table[vif].pkt_in++;
- mrt->vif_table[vif].bytes_in += skb->len;
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
/* Forward the frame */
if (c->mfc_origin == htonl(INADDR_ANY) &&
@@ -2140,22 +2158,14 @@ int ip_mr_input(struct sk_buff *skb)
skb = skb2;
}
- read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, dev);
- if (vif >= 0) {
- int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
- read_unlock(&mrt_lock);
-
- return err2;
- }
- read_unlock(&mrt_lock);
+ if (vif >= 0)
+ return ipmr_cache_unresolved(mrt, vif, skb, dev);
kfree_skb(skb);
return -ENODEV;
}
- read_lock(&mrt_lock);
ip_mr_forward(net, mrt, dev, skb, cache, local);
- read_unlock(&mrt_lock);
if (local)
return ip_local_deliver(skb);
@@ -2252,18 +2262,15 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
int vif = -1;
dev = skb->dev;
- read_lock(&mrt_lock);
if (dev)
vif = ipmr_find_vif(mrt, dev);
if (vif < 0) {
- read_unlock(&mrt_lock);
rcu_read_unlock();
return -ENODEV;
}
skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
if (!skb2) {
- read_unlock(&mrt_lock);
rcu_read_unlock();
return -ENOMEM;
}
@@ -2277,14 +2284,11 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
iph->daddr = daddr;
iph->version = 0;
err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
- read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
}
- read_lock(&mrt_lock);
err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
- read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
}
@@ -2404,7 +2408,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
return len;
}
-static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
struct net *net = read_pnet(&mrt->net);
struct nlmsghdr *nlh;
@@ -2744,18 +2748,21 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
+ struct net_device *vif_dev;
struct nlattr *vif_nest;
struct vif_device *vif;
+ vif = &mrt->vif_table[vifid];
+ vif_dev = rtnl_dereference(vif->dev);
/* if the VIF doesn't exist just continue */
- if (!VIF_EXISTS(mrt, vifid))
+ if (!vif_dev)
return true;
- vif = &mrt->vif_table[vifid];
vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF);
if (!vif_nest)
return false;
- if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
+
+ if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) ||
nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
@@ -2887,7 +2894,7 @@ out:
*/
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(mrt_lock)
+ __acquires(RCU)
{
struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2899,14 +2906,14 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
iter->mrt = mrt;
- read_lock(&mrt_lock);
+ rcu_read_lock();
return mr_vif_seq_start(seq, pos);
}
static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
- __releases(mrt_lock)
+ __releases(RCU)
{
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
@@ -2919,9 +2926,11 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
"Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
} else {
const struct vif_device *vif = v;
- const char *name = vif->dev ?
- vif->dev->name : "none";
+ const struct net_device *vif_dev;
+ const char *name;
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
seq_printf(seq,
"%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
vif - mrt->vif_table,
@@ -3017,7 +3026,7 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb,
struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
- ipmr_mr_table_iter, &mrt_lock, extack);
+ ipmr_mr_table_iter, extack);
}
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index aa8738a91210..271dc03fc6db 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -13,7 +13,7 @@ void vif_device_init(struct vif_device *v,
unsigned short flags,
unsigned short get_iflink_mask)
{
- v->dev = NULL;
+ RCU_INIT_POINTER(v->dev, NULL);
v->bytes_in = 0;
v->bytes_out = 0;
v->pkt_in = 0;
@@ -208,6 +208,7 @@ EXPORT_SYMBOL(mr_mfc_seq_next);
int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mr_mfc *c, struct rtmsg *rtm)
{
+ struct net_device *vif_dev;
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
@@ -220,10 +221,13 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
return -ENOENT;
}
- if (VIF_EXISTS(mrt, c->mfc_parent) &&
- nla_put_u32(skb, RTA_IIF,
- mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+ rcu_read_lock();
+ vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev);
+ if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) {
+ rcu_read_unlock();
return -EMSGSIZE;
+ }
+ rcu_read_unlock();
if (c->mfc_flags & MFC_OFFLOAD)
rtm->rtm_flags |= RTNH_F_OFFLOAD;
@@ -232,23 +236,27 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
if (!mp_attr)
return -EMSGSIZE;
+ rcu_read_lock();
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
- struct vif_device *vif;
+ struct vif_device *vif = &mrt->vif_table[ct];
+
+ vif_dev = rcu_dereference(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255) {
nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
if (!nhp) {
+ rcu_read_unlock();
nla_nest_cancel(skb, mp_attr);
return -EMSGSIZE;
}
nhp->rtnh_flags = 0;
nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
- vif = &mrt->vif_table[ct];
- nhp->rtnh_ifindex = vif->dev->ifindex;
+ nhp->rtnh_ifindex = vif_dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
}
}
+ rcu_read_unlock();
nla_nest_end(skb, mp_attr);
@@ -275,13 +283,14 @@ static bool mr_mfc_uses_dev(const struct mr_table *mrt,
int ct;
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
- const struct vif_device *vif;
-
- vif = &mrt->vif_table[ct];
- if (vif->dev == dev)
- return true;
- }
+ const struct net_device *vif_dev;
+ const struct vif_device *vif;
+
+ vif = &mrt->vif_table[ct];
+ vif_dev = rcu_access_pointer(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255 &&
+ vif_dev == dev)
+ return true;
}
return false;
}
@@ -390,7 +399,6 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
struct netlink_ext_ack *extack),
struct mr_table *(*mr_iter)(struct net *net,
struct mr_table *mrt),
- rwlock_t *mrt_lock,
struct netlink_ext_ack *extack)
{
struct mr_table *mrt;
@@ -402,22 +410,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
struct vif_device *v = &mrt->vif_table[0];
+ struct net_device *vif_dev;
struct mr_mfc *mfc;
int vifi;
/* Notifiy on table VIF entries */
- read_lock(mrt_lock);
+ rcu_read_lock();
for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
- if (!v->dev)
+ vif_dev = rcu_dereference(v->dev);
+ if (!vif_dev)
continue;
err = mr_call_vif_notifier(nb, family,
- FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id, extack);
+ FIB_EVENT_VIF_ADD, v,
+ vif_dev, vifi,
+ mrt->id, extack);
if (err)
break;
}
- read_unlock(mrt_lock);
+ rcu_read_unlock();
if (err)
return err;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index e459a391e607..853a75a8fbaf 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1858,7 +1858,7 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
/* __ip6_del_rt does a release, so do a hold here */
fib6_info_hold(f6i);
ipv6_stub->ip6_del_rt(net, f6i,
- !net->ipv4.sysctl_nexthop_compat_mode);
+ !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
}
}
@@ -2361,7 +2361,8 @@ out:
if (!rc) {
nh_base_seq_inc(net);
nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
- if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
+ if (replace_notify &&
+ READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 3c6101def7d6..b83c2bd9d722 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -50,7 +50,7 @@
struct ping_table {
struct hlist_nulls_head hash[PING_HTABLE_SIZE];
- rwlock_t lock;
+ spinlock_t lock;
};
static struct ping_table ping_table;
@@ -82,7 +82,7 @@ int ping_get_port(struct sock *sk, unsigned short ident)
struct sock *sk2 = NULL;
isk = inet_sk(sk);
- write_lock_bh(&ping_table.lock);
+ spin_lock(&ping_table.lock);
if (ident == 0) {
u32 i;
u16 result = ping_port_rover + 1;
@@ -128,14 +128,15 @@ next_port:
if (sk_unhashed(sk)) {
pr_debug("was not hashed\n");
sock_hold(sk);
- hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}
- write_unlock_bh(&ping_table.lock);
+ spin_unlock(&ping_table.lock);
return 0;
fail:
- write_unlock_bh(&ping_table.lock);
+ spin_unlock(&ping_table.lock);
return 1;
}
EXPORT_SYMBOL_GPL(ping_get_port);
@@ -153,19 +154,19 @@ void ping_unhash(struct sock *sk)
struct inet_sock *isk = inet_sk(sk);
pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
- write_lock_bh(&ping_table.lock);
+ spin_lock(&ping_table.lock);
if (sk_hashed(sk)) {
- hlist_nulls_del(&sk->sk_nulls_node);
- sk_nulls_node_init(&sk->sk_nulls_node);
+ hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
sock_put(sk);
isk->inet_num = 0;
isk->inet_sport = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
- write_unlock_bh(&ping_table.lock);
+ spin_unlock(&ping_table.lock);
}
EXPORT_SYMBOL_GPL(ping_unhash);
+/* Called under rcu_read_lock() */
static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
{
struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
@@ -190,8 +191,6 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
return NULL;
}
- read_lock_bh(&ping_table.lock);
-
ping_portaddr_for_each_entry(sk, hnode, hslot) {
isk = inet_sk(sk);
@@ -230,13 +229,11 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
sk->sk_bound_dev_if != sdif)
continue;
- sock_hold(sk);
goto exit;
}
sk = NULL;
exit:
- read_unlock_bh(&ping_table.lock);
return sk;
}
@@ -592,7 +589,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
sk->sk_err = err;
sk_error_report(sk);
out:
- sock_put(sk);
+ return;
}
EXPORT_SYMBOL_GPL(ping_err);
@@ -998,7 +995,6 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb)
reason = __ping_queue_rcv_skb(sk, skb2);
else
reason = SKB_DROP_REASON_NOMEM;
- sock_put(sk);
}
if (reason)
@@ -1084,13 +1080,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
}
void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
- __acquires(ping_table.lock)
+ __acquires(RCU)
{
struct ping_iter_state *state = seq->private;
state->bucket = 0;
state->family = family;
- read_lock_bh(&ping_table.lock);
+ rcu_read_lock();
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
@@ -1116,9 +1112,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
- __releases(ping_table.lock)
+ __releases(RCU)
{
- read_unlock_bh(&ping_table.lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ping_seq_stop);
@@ -1202,5 +1198,5 @@ void __init ping_init(void)
for (i = 0; i < PING_HTABLE_SIZE; i++)
INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
- rwlock_init(&ping_table.lock);
+ spin_lock_init(&ping_table.lock);
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bbd717805b10..006c1f0ed8b4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -85,21 +85,20 @@ struct raw_frag_vec {
int hlen;
};
-struct raw_hashinfo raw_v4_hashinfo = {
- .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
-};
+struct raw_hashinfo raw_v4_hashinfo;
EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
int raw_hash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
- struct hlist_head *head;
+ struct hlist_nulls_head *hlist;
- head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
+ hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
- write_lock_bh(&h->lock);
- sk_add_node(sk, head);
- write_unlock_bh(&h->lock);
+ spin_lock(&h->lock);
+ __sk_nulls_add_node_rcu(sk, hlist);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ spin_unlock(&h->lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
@@ -110,31 +109,26 @@ void raw_unhash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
- write_lock_bh(&h->lock);
- if (sk_del_node_init(sk))
+ spin_lock(&h->lock);
+ if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- write_unlock_bh(&h->lock);
+ spin_unlock(&h->lock);
}
EXPORT_SYMBOL_GPL(raw_unhash_sk);
-struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
- unsigned short num, __be32 raddr, __be32 laddr,
- int dif, int sdif)
+bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num,
+ __be32 raddr, __be32 laddr, int dif, int sdif)
{
- sk_for_each_from(sk) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
- !(inet->inet_daddr && inet->inet_daddr != raddr) &&
- !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
- goto found; /* gotcha */
- }
- sk = NULL;
-found:
- return sk;
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+ !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+ !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+ raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return true;
+ return false;
}
-EXPORT_SYMBOL_GPL(__raw_v4_lookup);
+EXPORT_SYMBOL_GPL(raw_v4_match);
/*
* 0 - deliver
@@ -168,23 +162,20 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
*/
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
{
+ struct net *net = dev_net(skb->dev);
+ struct hlist_nulls_head *hlist;
+ struct hlist_nulls_node *hnode;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
- struct sock *sk;
- struct hlist_head *head;
int delivered = 0;
- struct net *net;
-
- read_lock(&raw_v4_hashinfo.lock);
- head = &raw_v4_hashinfo.ht[hash];
- if (hlist_empty(head))
- goto out;
-
- net = dev_net(skb->dev);
- sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
- iph->saddr, iph->daddr, dif, sdif);
+ struct sock *sk;
- while (sk) {
+ hlist = &raw_v4_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_nulls_for_each(sk, hnode, hlist) {
+ if (!raw_v4_match(net, sk, iph->protocol,
+ iph->saddr, iph->daddr, dif, sdif))
+ continue;
delivered = 1;
if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
@@ -195,31 +186,16 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
if (clone)
raw_rcv(sk, clone);
}
- sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
- iph->saddr, iph->daddr,
- dif, sdif);
}
-out:
- read_unlock(&raw_v4_hashinfo.lock);
+ rcu_read_unlock();
return delivered;
}
int raw_local_deliver(struct sk_buff *skb, int protocol)
{
- int hash;
- struct sock *raw_sk;
-
- hash = protocol & (RAW_HTABLE_SIZE - 1);
- raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
-
- /* If there maybe a raw socket we must check - if not we
- * don't care less
- */
- if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
- raw_sk = NULL;
-
- return raw_sk != NULL;
+ int hash = protocol & (RAW_HTABLE_SIZE - 1);
+ return raw_v4_input(skb, ip_hdr(skb), hash);
}
static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
@@ -286,31 +262,27 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
{
- int hash;
- struct sock *raw_sk;
+ struct net *net = dev_net(skb->dev);
+ struct hlist_nulls_head *hlist;
+ struct hlist_nulls_node *hnode;
+ int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
const struct iphdr *iph;
- struct net *net;
+ struct sock *sk;
+ int hash;
hash = protocol & (RAW_HTABLE_SIZE - 1);
+ hlist = &raw_v4_hashinfo.ht[hash];
- read_lock(&raw_v4_hashinfo.lock);
- raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
- if (raw_sk) {
- int dif = skb->dev->ifindex;
- int sdif = inet_sdif(skb);
-
+ rcu_read_lock();
+ sk_nulls_for_each(sk, hnode, hlist) {
iph = (const struct iphdr *)skb->data;
- net = dev_net(skb->dev);
-
- while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
- iph->daddr, iph->saddr,
- dif, sdif)) != NULL) {
- raw_err(raw_sk, skb, info);
- raw_sk = sk_next(raw_sk);
- iph = (const struct iphdr *)skb->data;
- }
+ if (!raw_v4_match(net, sk, iph->protocol,
+ iph->daddr, iph->saddr, dif, sdif))
+ continue;
+ raw_err(sk, skb, info);
}
- read_unlock(&raw_v4_hashinfo.lock);
+ rcu_read_unlock();
}
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -971,44 +943,41 @@ struct proto raw_prot = {
};
#ifdef CONFIG_PROC_FS
-static struct sock *raw_get_first(struct seq_file *seq)
+static struct sock *raw_get_first(struct seq_file *seq, int bucket)
{
- struct sock *sk;
struct raw_hashinfo *h = pde_data(file_inode(seq->file));
struct raw_iter_state *state = raw_seq_private(seq);
+ struct hlist_nulls_head *hlist;
+ struct hlist_nulls_node *hnode;
+ struct sock *sk;
- for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
+ for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
++state->bucket) {
- sk_for_each(sk, &h->ht[state->bucket])
+ hlist = &h->ht[state->bucket];
+ sk_nulls_for_each(sk, hnode, hlist) {
if (sock_net(sk) == seq_file_net(seq))
- goto found;
+ return sk;
+ }
}
- sk = NULL;
-found:
- return sk;
+ return NULL;
}
static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
{
- struct raw_hashinfo *h = pde_data(file_inode(seq->file));
struct raw_iter_state *state = raw_seq_private(seq);
do {
- sk = sk_next(sk);
-try_again:
- ;
+ sk = sk_nulls_next(sk);
} while (sk && sock_net(sk) != seq_file_net(seq));
- if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
- sk = sk_head(&h->ht[state->bucket]);
- goto try_again;
- }
+ if (!sk)
+ return raw_get_first(seq, state->bucket + 1);
return sk;
}
static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
{
- struct sock *sk = raw_get_first(seq);
+ struct sock *sk = raw_get_first(seq, 0);
if (sk)
while (pos && (sk = raw_get_next(seq, sk)) != NULL)
@@ -1017,11 +986,9 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
}
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(&h->lock)
+ __acquires(RCU)
{
- struct raw_hashinfo *h = pde_data(file_inode(seq->file));
-
- read_lock(&h->lock);
+ rcu_read_lock();
return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL_GPL(raw_seq_start);
@@ -1031,7 +998,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct sock *sk;
if (v == SEQ_START_TOKEN)
- sk = raw_get_first(seq);
+ sk = raw_get_first(seq, 0);
else
sk = raw_get_next(seq, v);
++*pos;
@@ -1040,11 +1007,9 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(raw_seq_next);
void raw_seq_stop(struct seq_file *seq, void *v)
- __releases(&h->lock)
+ __releases(RCU)
{
- struct raw_hashinfo *h = pde_data(file_inode(seq->file));
-
- read_unlock(&h->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(raw_seq_stop);
@@ -1106,6 +1071,7 @@ static __net_initdata struct pernet_operations raw_net_ops = {
int __init raw_proc_init(void)
{
+
return register_pernet_subsys(&raw_net_ops);
}
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index ccacbde30a2c..999321834b94 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -34,57 +34,57 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r)
* use helper to figure it out.
*/
-static struct sock *raw_lookup(struct net *net, struct sock *from,
- const struct inet_diag_req_v2 *req)
+static bool raw_lookup(struct net *net, struct sock *sk,
+ const struct inet_diag_req_v2 *req)
{
struct inet_diag_req_raw *r = (void *)req;
- struct sock *sk = NULL;
if (r->sdiag_family == AF_INET)
- sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
- r->id.idiag_dst[0],
- r->id.idiag_src[0],
- r->id.idiag_if, 0);
+ return raw_v4_match(net, sk, r->sdiag_raw_protocol,
+ r->id.idiag_dst[0],
+ r->id.idiag_src[0],
+ r->id.idiag_if, 0);
#if IS_ENABLED(CONFIG_IPV6)
else
- sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
- (const struct in6_addr *)r->id.idiag_src,
- (const struct in6_addr *)r->id.idiag_dst,
- r->id.idiag_if, 0);
+ return raw_v6_match(net, sk, r->sdiag_raw_protocol,
+ (const struct in6_addr *)r->id.idiag_src,
+ (const struct in6_addr *)r->id.idiag_dst,
+ r->id.idiag_if, 0);
#endif
- return sk;
+ return false;
}
static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
{
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
- struct sock *sk = NULL, *s;
+ struct hlist_nulls_head *hlist;
+ struct hlist_nulls_node *hnode;
+ struct sock *sk;
int slot;
if (IS_ERR(hashinfo))
return ERR_CAST(hashinfo);
- read_lock(&hashinfo->lock);
+ rcu_read_lock();
for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
- sk_for_each(s, &hashinfo->ht[slot]) {
- sk = raw_lookup(net, s, r);
- if (sk) {
+ hlist = &hashinfo->ht[slot];
+ sk_nulls_for_each(sk, hnode, hlist) {
+ if (raw_lookup(net, sk, r)) {
/*
* Grab it and keep until we fill
- * diag meaage to be reported, so
+ * diag message to be reported, so
* caller should call sock_put then.
- * We can do that because we're keeping
- * hashinfo->lock here.
*/
- sock_hold(sk);
- goto out_unlock;
+ if (refcount_inc_not_zero(&sk->sk_refcnt))
+ goto out_unlock;
}
}
}
+ sk = ERR_PTR(-ENOENT);
out_unlock:
- read_unlock(&hashinfo->lock);
+ rcu_read_unlock();
- return sk ? sk : ERR_PTR(-ENOENT);
+ return sk;
}
static int raw_diag_dump_one(struct netlink_callback *cb,
@@ -142,6 +142,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
struct inet_diag_dump_data *cb_data;
+ struct hlist_nulls_head *hlist;
+ struct hlist_nulls_node *hnode;
int num, s_num, slot, s_slot;
struct sock *sk = NULL;
struct nlattr *bc;
@@ -154,11 +156,12 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
s_slot = cb->args[0];
num = s_num = cb->args[1];
- read_lock(&hashinfo->lock);
+ rcu_read_lock();
for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
num = 0;
- sk_for_each(sk, &hashinfo->ht[slot]) {
+ hlist = &hashinfo->ht[slot];
+ sk_nulls_for_each(sk, hnode, hlist) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
@@ -181,7 +184,7 @@ next:
}
out_unlock:
- read_unlock(&hashinfo->lock);
+ rcu_read_unlock();
cb->args[0] = slot;
cb->args[1] = num;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 356f535f3443..8f4a8f66d604 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1550,9 +1550,8 @@ void rt_flush_dev(struct net_device *dev)
if (rt->dst.dev != dev)
continue;
rt->dst.dev = blackhole_netdev;
- dev_replace_track(dev, blackhole_netdev,
- &rt->dst.dev_tracker,
- GFP_ATOMIC);
+ netdev_ref_replace(dev, blackhole_netdev,
+ &rt->dst.dev_tracker, GFP_ATOMIC);
list_move(&rt->rt_uncached, &ul->quarantine);
}
spin_unlock_bh(&ul->lock);
@@ -2440,24 +2439,9 @@ martian_source:
goto out;
}
-int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
-{
- struct fib_result res;
- int err;
-
- tos &= IPTOS_RT_MASK;
- rcu_read_lock();
- err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
- rcu_read_unlock();
-
- return err;
-}
-EXPORT_SYMBOL(ip_route_input_noref);
-
/* called with rcu_read_lock held */
-int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, struct fib_result *res)
+static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, struct fib_result *res)
{
/* Multicast recognition logic is moved from route cache to here.
* The problem was that too many Ethernet cards have broken/missing
@@ -2506,6 +2490,21 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev)
+{
+ struct fib_result res;
+ int err;
+
+ tos &= IPTOS_RT_MASK;
+ rcu_read_lock();
+ err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
const struct flowi4 *fl4, int orig_oif,
@@ -2851,7 +2850,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->output = dst_discard_out;
new->dev = net->loopback_dev;
- dev_hold_track(new->dev, &new->dev_tracker, GFP_ATOMIC);
+ netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
rt->rt_is_input = ort->rt_is_input;
rt->rt_iif = ort->rt_iif;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f33c31dd7366..b387c4835155 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -273,7 +273,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
if (!ecn_ok)
return false;
- if (net->ipv4.sysctl_tcp_ecn)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_ecn))
return true;
return dst_feature(dst, RTAX_FEATURE_ECN);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index cd448cdd3b38..108fd86f2718 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -599,6 +599,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_echo_enable_probe",
@@ -615,6 +617,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_ignore_bogus_error_responses",
@@ -622,6 +626,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_errors_use_inbound_ifaddr",
@@ -629,6 +635,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_ratelimit",
@@ -668,6 +676,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "tcp_ecn_fallback",
@@ -675,6 +685,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "ip_dynaddr",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fdb80b9608fe..ddf63a0ad643 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -294,6 +294,8 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
+DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
@@ -856,9 +858,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
{
struct sk_buff *skb;
- if (unlikely(tcp_under_memory_pressure(sk)))
- sk_mem_reclaim_partial(sk);
-
skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
if (likely(skb)) {
bool mem_scheduled;
@@ -952,6 +951,24 @@ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
return 0;
}
+
+static int tcp_wmem_schedule(struct sock *sk, int copy)
+{
+ int left;
+
+ if (likely(sk_wmem_schedule(sk, copy)))
+ return copy;
+
+ /* We could be in trouble if we have nothing queued.
+ * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
+ * to guarantee some progress.
+ */
+ left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued;
+ if (left > 0)
+ sk_forced_mem_schedule(sk, min(left, copy));
+ return min(copy, sk->sk_forward_alloc);
+}
+
static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
struct page *page, int offset, size_t *size)
{
@@ -987,7 +1004,11 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ return NULL;
+
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
return NULL;
if (can_coalesce) {
@@ -1347,7 +1368,8 @@ new_segment:
skb_zcopy_downgrade_managed(skb);
}
- if (!sk_wmem_schedule(sk, copy))
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
@@ -1374,7 +1396,8 @@ new_segment:
skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
if (!skb_zcopy_pure(skb)) {
- if (!sk_wmem_schedule(sk, copy))
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
goto wait_for_space;
}
@@ -1720,6 +1743,50 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
}
EXPORT_SYMBOL(tcp_read_sock);
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ int copied = 0;
+ u32 offset;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ int used;
+
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ used = recv_actor(sk, skb);
+ if (used <= 0) {
+ if (!copied)
+ copied = used;
+ break;
+ }
+ seq += used;
+ copied += used;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ consume_skb(skb);
+ ++seq;
+ break;
+ }
+ consume_skb(skb);
+ break;
+ }
+ WRITE_ONCE(tp->copied_seq, seq);
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (copied > 0)
+ tcp_cleanup_rbuf(sk, copied);
+
+ return copied;
+}
+EXPORT_SYMBOL(tcp_read_skb);
+
int tcp_peek_len(struct socket *sock)
{
return tcp_inq(sock->sk);
@@ -2726,7 +2793,8 @@ static void tcp_orphan_update(struct timer_list *unused)
static bool tcp_too_many_orphans(int shift)
{
- return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
+ return READ_ONCE(tcp_orphan_cache) << shift >
+ READ_ONCE(sysctl_tcp_max_orphans);
}
bool tcp_check_oom(struct sock *sk, int shift)
@@ -2773,8 +2841,6 @@ void __tcp_close(struct sock *sk, long timeout)
__kfree_skb(skb);
}
- sk_mem_reclaim(sk);
-
/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
if (sk->sk_state == TCP_CLOSE)
goto adjudge_to_death;
@@ -2882,7 +2948,6 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- sk_mem_reclaim(sk);
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
@@ -2960,7 +3025,6 @@ void tcp_write_queue_purge(struct sock *sk)
}
tcp_rtx_queue_purge(sk);
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
- sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
tcp_sk(sk)->packets_out = 0;
inet_csk(sk)->icsk_backoff = 0;
@@ -4521,16 +4585,24 @@ EXPORT_SYMBOL_GPL(tcp_done);
int tcp_abort(struct sock *sk, int err)
{
- if (!sk_fullsock(sk)) {
- if (sk->sk_state == TCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
+ int state = inet_sk_state_load(sk);
- local_bh_disable();
- inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- local_bh_enable();
- return 0;
- }
- return -EOPNOTSUPP;
+ if (state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ local_bh_disable();
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ local_bh_enable();
+ return 0;
+ }
+ if (state == TCP_TIME_WAIT) {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+
+ refcount_inc(&tw->tw_refcnt);
+ local_bh_disable();
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
}
/* Don't race with userspace socket closes such as tcp_close. */
@@ -4662,11 +4734,11 @@ void __init tcp_init(void)
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);
- init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
- init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 0d3f68bb51c0..a1626afe87a1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -540,6 +540,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
struct proto *base)
{
prot[TCP_BPF_BASE] = *base;
+ prot[TCP_BPF_BASE].destroy = sock_map_destroy;
prot[TCP_BPF_BASE].close = sock_map_close;
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2e2a9ece9af2..acee998c14b2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -805,7 +805,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
- sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
@@ -3967,7 +3966,7 @@ static bool smc_parse_options(const struct tcphdr *th,
/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
* value on success.
*/
-static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
{
const unsigned char *ptr = (const unsigned char *)(th + 1);
int length = (th->doff * 4) - sizeof(struct tcphdr);
@@ -4006,6 +4005,7 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
}
return mss;
}
+EXPORT_SYMBOL_GPL(tcp_parse_mss_option);
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
@@ -4390,7 +4390,6 @@ void tcp_fin(struct sock *sk)
skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
- sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
@@ -5287,7 +5286,7 @@ new_range:
before(TCP_SKB_CB(skb)->end_seq, start)) {
/* Do not attempt collapsing tiny skbs */
if (range_truesize != head->truesize ||
- end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+ end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
tcp_collapse(sk, NULL, &tp->out_of_order_queue,
head, skb, start, end);
} else {
@@ -5336,7 +5335,6 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
tcp_drop_reason(sk, rb_to_skb(node),
SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
if (!prev || goal <= 0) {
- sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
break;
@@ -5383,7 +5381,6 @@ static int tcp_prune_queue(struct sock *sk)
skb_peek(&sk->sk_receive_queue),
NULL,
tp->copied_seq, tp->rcv_nxt);
- sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
@@ -6729,7 +6726,7 @@ static void tcp_ecn_create_request(struct request_sock *req,
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
- ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
+ ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index da5a3c44c4fb..228d36692d08 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -819,6 +819,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
+ xfrm_sk_clone_policy(ctl_sk, sk);
}
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -827,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
transmit_time);
ctl_sk->sk_mark = 0;
+ xfrm_sk_free_policy(ctl_sk);
sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -3049,7 +3051,10 @@ struct proto tcp_prot = {
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
+
.memory_allocated = &tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1c054431e358..858a15cc2cc9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -324,7 +324,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
- bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
if (!use_ecn) {
@@ -346,7 +346,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
/* tp->ecn_flags are cleared at a later point in time when
* SYN ACK is ultimatively being received.
*/
@@ -3144,7 +3144,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
struct tcp_sock *tp = tcp_sk(sk);
unsigned int cur_mss;
int diff, len, err;
-
+ int avail_wnd;
/* Inconclusive MTU probe */
if (icsk->icsk_mtup.probe_size)
@@ -3166,17 +3166,25 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
return -EHOSTUNREACH; /* Routing failure or similar. */
cur_mss = tcp_current_mss(sk);
+ avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
- * our retransmit serves as a zero window probe.
+ * our retransmit of one segment serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
- TCP_SKB_CB(skb)->seq != tp->snd_una)
- return -EAGAIN;
+ if (avail_wnd <= 0) {
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una)
+ return -EAGAIN;
+ avail_wnd = cur_mss;
+ }
len = cur_mss * segs;
+ if (len > avail_wnd) {
+ len = rounddown(avail_wnd, cur_mss);
+ if (!len)
+ len = avail_wnd;
+ }
if (skb->len > len) {
if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
cur_mss, GFP_ATOMIC))
@@ -3190,8 +3198,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
diff -= tcp_skb_pcount(skb);
if (diff)
tcp_adjust_pcount(sk, skb, diff);
- if (skb->len < cur_mss)
- tcp_retrans_try_collapse(sk, skb, cur_mss);
+ avail_wnd = min_t(int, avail_wnd, cur_mss);
+ if (skb->len < avail_wnd)
+ tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
/* RFC3168, section 6.1.1.1. ECN fallback */
@@ -3362,12 +3371,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
*/
void sk_forced_mem_schedule(struct sock *sk, int size)
{
- int amt;
+ int delta, amt;
- if (size <= sk->sk_forward_alloc)
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
return;
- amt = sk_mem_pages(size);
- sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ amt = sk_mem_pages(delta);
+ sk->sk_forward_alloc += amt << PAGE_SHIFT;
sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 20cf4a98c69d..2208755e8efc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -290,15 +290,13 @@ void tcp_delack_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- sk_mem_reclaim_partial(sk);
-
if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
- goto out;
+ return;
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
- goto out;
+ return;
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
@@ -317,10 +315,6 @@ void tcp_delack_timer_handler(struct sock *sk)
tcp_send_ack(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
-
-out:
- if (tcp_under_memory_pressure(sk))
- sk_mem_reclaim(sk);
}
@@ -600,11 +594,11 @@ void tcp_write_timer_handler(struct sock *sk)
if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
!icsk->icsk_pending)
- goto out;
+ return;
if (time_after(icsk->icsk_timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
- goto out;
+ return;
}
tcp_mstamp_refresh(tcp_sk(sk));
@@ -626,9 +620,6 @@ void tcp_write_timer_handler(struct sock *sk)
tcp_probe_timer(sk);
break;
}
-
-out:
- sk_mem_reclaim(sk);
}
static void tcp_write_timer(struct timer_list *t)
@@ -743,8 +734,6 @@ static void tcp_keepalive_timer (struct timer_list *t)
elapsed = keepalive_time_when(tp) - elapsed;
}
- sk_mem_reclaim(sk);
-
resched:
inet_csk_reset_keepalive_timer (sk, elapsed);
goto out;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index aa9f2ec3dc46..2516078aa03e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -125,6 +125,8 @@ EXPORT_SYMBOL(sysctl_udp_mem);
atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(udp_memory_allocated);
+DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
@@ -1461,11 +1463,11 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
sk->sk_forward_alloc += size;
- amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
+ amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
sk->sk_forward_alloc -= amt;
if (amt)
- __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+ __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
atomic_sub(size, &sk->sk_rmem_alloc);
@@ -1558,7 +1560,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
spin_lock(&list->lock);
if (size >= sk->sk_forward_alloc) {
amt = sk_mem_pages(size);
- delta = amt << SK_MEM_QUANTUM_SHIFT;
+ delta = amt << PAGE_SHIFT;
if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
err = -ENOBUFS;
spin_unlock(&list->lock);
@@ -1795,8 +1797,7 @@ busy_check:
}
EXPORT_SYMBOL(__skb_recv_udp);
-int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
int copied = 0;
@@ -1818,7 +1819,8 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
continue;
}
- used = recv_actor(desc, skb, 0, skb->len);
+ WARN_ON(!skb_set_owner_sk_safe(skb, sk));
+ used = recv_actor(sk, skb);
if (used <= 0) {
if (!copied)
copied = used;
@@ -1829,13 +1831,12 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
}
kfree_skb(skb);
- if (!desc->count)
- break;
+ break;
}
return copied;
}
-EXPORT_SYMBOL(udp_read_sock);
+EXPORT_SYMBOL(udp_read_skb);
/*
* This should be easy, if there is something there we
@@ -2946,6 +2947,8 @@ struct proto udp_prot = {
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
.memory_allocated = &udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
@@ -3263,8 +3266,8 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
static void __udp_sysctl_init(struct net *net)
{
- net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM;
- net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+ net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
+ net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
#ifdef CONFIG_NET_L3_MASTER_DEV
net->ipv4.sysctl_udp_l3mdev_accept = 0;
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index cd1cd68adeec..6e08a76ae1e7 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -51,7 +51,10 @@ struct proto udplite_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
+
.memory_allocated = &udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udplite_table,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6fde0b184791..3d0dfa6cf9f9 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -75,7 +75,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_iif = fl4->flowi4_iif;
xdst->u.dst.dev = dev;
- dev_hold_track(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
+ netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */