22 files changed, 1731 insertions, 360 deletions
diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 387fda80f05f..3fb5fa142eef 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -129,6 +129,26 @@ drop_packet - INTEGER threshold. When the mode 3 is set, the always mode drop rate is controlled by the /proc/sys/net/ipv4/vs/am_droprate. +est_cpulist - CPULIST + Allowed CPUs for estimation kthreads + + Syntax: standard cpulist format + empty list - stop kthread tasks and estimation + default - the system's housekeeping CPUs for kthreads + + Example: + "all": all possible CPUs + "0-N": all possible CPUs, N denotes last CPU number + "0,1-N:1/2": first and all CPUs with odd number + "": empty list + +est_nice - INTEGER + default 0 + Valid range: -20 (more favorable) .. 19 (less favorable) + + Niceness value to use for the estimation kthreads (scheduling + priority) + expire_nodest_conn - BOOLEAN - 0 - disabled (default) - not 0 - enabled @@ -304,8 +324,8 @@ run_estimation - BOOLEAN 0 - disabled not 0 - enabled (default) - If disabled, the estimation will be stop, and you can't see - any update on speed estimation data. + If disabled, the estimation will be suspended and kthread tasks + stopped. You can always re-enable estimation by setting this value to 1. But be careful, the first estimation after re-enable is not diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d5..ab934ad951a8 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, *skbinfo = ext->skbinfo; } +static inline void +nf_inet_addr_mask_inplace(union nf_inet_addr *a1, + const union nf_inet_addr *mask) +{ + a1->all[0] &= mask->all[0]; + a1->all[1] &= mask->all[1]; + a1->all[2] &= mask->all[2]; + a1->all[3] &= mask->all[3]; +} + #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff1804a0c469..c6c61100d244 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack.h> #endif #include <net/net_namespace.h> /* Netw namespace */ +#include <linux/sched/isolation.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -42,6 +43,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; +extern struct mutex __ip_vs_mutex; + struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -351,11 +354,11 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u64 conns; /* connections scheduled */ - __u64 inpkts; /* incoming packets */ - __u64 outpkts; /* outgoing packets */ - __u64 inbytes; /* incoming bytes */ - __u64 outbytes; /* outgoing bytes */ + u64_stats_t conns; /* connections scheduled */ + u64_stats_t inpkts; /* incoming packets */ + u64_stats_t outpkts; /* outgoing packets */ + u64_stats_t inbytes; /* incoming bytes */ + u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { @@ -363,9 +366,12 @@ struct ip_vs_cpu_stats { struct u64_stats_sync syncp; }; +/* Default nice for estimator kthreads */ +#define IPVS_EST_NICE 0 + /* IPVS statistics objects */ struct ip_vs_estimator { - struct list_head list; + struct hlist_node list; 
u64 last_inbytes; u64 last_outbytes; @@ -378,6 +384,10 @@ struct ip_vs_estimator { u64 outpps; u64 inbps; u64 outbps; + + s32 ktid:16, /* kthread ID, -1=temp list */ + ktrow:8, /* row/tick ID for kthread */ + ktcid:8; /* chain ID for kthread tick */ }; /* @@ -405,6 +415,76 @@ struct ip_vs_stats { struct ip_vs_kstats kstats0; /* reset values */ }; +struct ip_vs_stats_rcu { + struct ip_vs_stats s; + struct rcu_head rcu_head; +}; + +int ip_vs_stats_init_alloc(struct ip_vs_stats *s); +struct ip_vs_stats *ip_vs_stats_alloc(void); +void ip_vs_stats_release(struct ip_vs_stats *stats); +void ip_vs_stats_free(struct ip_vs_stats *stats); + +/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ +#define IPVS_EST_NTICKS 50 +/* Estimation uses a 2-second period containing ticks (in jiffies) */ +#define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) + +/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). + * Value of 4 and above ensures kthreads will take work without exceeding + * the CPU capacity under different circumstances. + */ +#define IPVS_EST_LOAD_DIVISOR 8 + +/* Kthreads should not have work that exceeds the CPU load above 50% */ +#define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) + +/* Desired number of chains per timer tick (chain load factor in 100us units), + * 48=4.8ms of 40ms tick (12% CPU usage): + * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 + */ +#define IPVS_EST_CHAIN_FACTOR \ + ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) + +/* Compiled number of chains per tick + * The defines should match cond_resched_rcu + */ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) +#define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR +#else +#define IPVS_EST_TICK_CHAINS 1 +#endif + +#if IPVS_EST_NTICKS > 127 +#error Too many timer ticks for ktrow +#endif + +/* Multiple chains processed in same tick */ +struct ip_vs_est_tick_data { + struct hlist_head chains[IPVS_EST_TICK_CHAINS]; + DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); + DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); + int chain_len[IPVS_EST_TICK_CHAINS]; +}; + +/* Context for estimation kthread */ +struct ip_vs_est_kt_data { + struct netns_ipvs *ipvs; + struct task_struct *task; /* task if running */ + struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; + DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ + unsigned long est_timer; /* estimation timer (jiffies) */ + struct ip_vs_stats *calc_stats; /* Used for calculation */ + int tick_len[IPVS_EST_NTICKS]; /* est count */ + int id; /* ktid per netns */ + int chain_max; /* max ests per tick chain */ + int tick_max; /* max ests per tick */ + int est_count; /* attached ests to kthread */ + int est_max_count; /* max ests per kthread */ + int add_row; /* row for new ests */ + int est_row; /* estimated row */ +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -688,6 +768,7 @@ struct ip_vs_dest { union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ + struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; @@ -869,7 +950,7 @@ struct netns_ipvs { atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ - struct ip_vs_stats tot_stats; /* Statistics & est. */ + struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ @@ -932,6 +1013,12 @@ struct netns_ipvs { int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; +#ifdef CONFIG_SYSCTL + cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ + int est_cpulist_valid; /* cpulist set */ + int sysctl_est_nice; /* kthread nice */ + int est_stopped; /* stop tasks */ +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -942,9 +1029,17 @@ struct netns_ipvs { struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ - struct list_head est_list; /* estimator list */ - spinlock_t est_lock; - struct timer_list est_timer; /* Estimation timer */ + struct delayed_work est_reload_work;/* Reload kthread tasks */ + struct mutex est_mutex; /* protect kthread tasks */ + struct hlist_head est_temp_list; /* Ests during calc phase */ + struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ + unsigned long est_max_threads;/* Hard limit of kthreads */ + int est_calc_phase; /* Calculation phase */ + int est_chain_max; /* Calculated chain_max */ + int est_kt_count; /* Allocated ptrs */ + int est_add_ktid; /* ktid where to add ests */ + atomic_t est_genid; /* kthreads reload genid */ + atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; @@ -1077,6 +1172,19 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_est_nice; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1174,6 +1282,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return IPVS_EST_NICE; +} + #endif /* IPVS core functions @@ -1475,10 +1593,41 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd); +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); + +static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + /* Stop tasks while cpulist is empty or if disabled with flag */ + ipvs->est_stopped = !sysctl_run_estimation(ipvs) || + (ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs))); +#endif +} + +static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + return ipvs->est_stopped; +#else + return false; +#endif +} + +static inline int 
ip_vs_est_max_threads(struct netns_ipvs *ipvs) +{ + unsigned int limit = IPVS_EST_CPU_KTHREADS * + cpumask_weight(sysctl_est_cpulist(ipvs)); + + return max(1U, limit); +} /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index b2b9de70d9f4..71d1269fe4d4 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -71,8 +71,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo); +unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 79e5d68b87af..333807efd32b 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -85,6 +85,7 @@ enum { IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO, /* 9 */ IPSET_ATTR_MARK, /* 10 */ IPSET_ATTR_MARKMASK, /* 11 */ + IPSET_ATTR_BITMASK, /* 12 */ /* Reserve empty slots */ IPSET_ATTR_CADT_MAX = 16, /* Create-only specific attributes */ @@ -153,6 +154,7 @@ enum ipset_errno { IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, IPSET_ERR_SKBINFO, + IPSET_ERR_BITMASK_NETMASK_EXCL, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index edc6ddab0de6..c742469afe21 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -16,6 +16,7 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, SCTP_CONNTRACK_HEARTBEAT_SENT, SCTP_CONNTRACK_HEARTBEAT_ACKED, + SCTP_CONNTRACK_DATA_SENT, SCTP_CONNTRACK_MAX }; diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 6b20fb22717b..94e74034706d 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -95,6 +95,7 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, + CTA_TIMEOUT_SCTP_DATA_SENT, __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 73242962be5d..5c5dd437f1c2 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -366,42 +366,12 @@ static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, return br_dev_queue_push_xmit(net, sk, skb); } -static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - switch (skb->protocol) { - case htons(ETH_P_IP): - protoff = skb_network_offset(skb) + ip_hdrlen(skb); - break; - case htons(ETH_P_IPV6): { - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - 
&frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) - return nf_conntrack_confirm(skb); - } - break; - default: - return NF_ACCEPT; - } - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int ret; - ret = nf_ct_bridge_confirm(skb); + ret = nf_confirm(priv, skb, state); if (ret != NF_ACCEPT) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7499192af586..7c2399541771 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -159,6 +159,17 @@ htable_size(u8 hbits) (SET_WITH_TIMEOUT(set) && \ ip_set_timeout_expired(ext_timeout(d, set))) +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) +static const union nf_inet_addr onesmask = { + .all[0] = 0xffffffff, + .all[1] = 0xffffffff, + .all[2] = 0xffffffff, + .all[3] = 0xffffffff +}; + +static const union nf_inet_addr zeromask = {}; +#endif + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -283,8 +294,9 @@ struct htype { u32 markmask; /* markmask value for mark mask to store */ #endif u8 bucketsize; /* max elements in an array block */ -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) u8 netmask; /* netmask value for subnets to store */ + union nf_inet_addr bitmask; /* stores bitmask */ #endif struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ @@ -459,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) /* Resizing changes htable_bits, so we ignore it */ return x->maxelem == y->maxelem && a->timeout == b->timeout && -#ifdef IP_SET_HASH_WITH_NETMASK - x->netmask == y->netmask && +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + nf_inet_addr_cmp(&x->bitmask, &y->bitmask) && #endif #ifdef IP_SET_HASH_WITH_MARKMASK x->markmask == y->markmask && @@ -1264,9 +1276,21 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_BITMASK + /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask + * and not bitmask. These two are mutually exclusive. */ + if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) { + if (set->family == NFPROTO_IPV4) { + if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip)) + goto nla_put_failure; + } else if (set->family == NFPROTO_IPV6) { + if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6)) + goto nla_put_failure; + } + } +#endif #ifdef IP_SET_HASH_WITH_NETMASK - if (h->netmask != HOST_MASK && - nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif #ifdef IP_SET_HASH_WITH_MARKMASK @@ -1429,8 +1453,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u32 markmask; #endif u8 hbits; -#ifdef IP_SET_HASH_WITH_NETMASK - u8 netmask; +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + int ret __attribute__((unused)) = 0; + u8 netmask = set->family == NFPROTO_IPV4 ? 
32 : 128; + union nf_inet_addr bitmask = onesmask; #endif size_t hsize; struct htype *h; @@ -1468,7 +1494,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_NETMASK - netmask = set->family == NFPROTO_IPV4 ? 32 : 128; if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); @@ -1476,6 +1501,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, (set->family == NFPROTO_IPV6 && netmask > 128) || netmask == 0) return -IPSET_ERR_INVALID_NETMASK; + + /* we convert netmask to bitmask and store it */ + if (set->family == NFPROTO_IPV4) + bitmask.ip = ip_set_netmask(netmask); + else + ip6_netmask(&bitmask, netmask); + } +#endif + +#ifdef IP_SET_HASH_WITH_BITMASK + if (tb[IPSET_ATTR_BITMASK]) { + /* bitmask and netmask do the same thing, allow only one of these options */ + if (tb[IPSET_ATTR_NETMASK]) + return -IPSET_ERR_BITMASK_NETMASK_EXCL; + + if (set->family == NFPROTO_IPV4) { + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip); + if (ret || !bitmask.ip) + return -IPSET_ERR_INVALID_NETMASK; + } else if (set->family == NFPROTO_IPV6) { + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask); + if (ret || ipv6_addr_any(&bitmask.in6)) + return -IPSET_ERR_INVALID_NETMASK; + } + + if (nf_inet_addr_cmp(&bitmask, &zeromask)) + return -IPSET_ERR_INVALID_NETMASK; } #endif @@ -1518,7 +1570,8 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, for (i = 0; i < ahash_numof_locks(hbits); i++) spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + h->bitmask = bitmask; h->netmask = netmask; #endif #ifdef IP_SET_HASH_WITH_MARKMASK diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 75d556d71652..e30513cefd90 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -24,7 +24,8 @@ /* 2 Comments support */ /* 3 Forceadd support */ /* 4 skbinfo support */ -#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */ +/* 5 bucketsize, initval support */ +#define IPSET_TYPE_REV_MAX 6 /* bitmask support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -34,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -86,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); - ip &= ip_set_netmask(h->netmask); + ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; @@ -119,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - ip &= ip_set_hostmask(h->netmask); + ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; @@ -185,12 +187,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static void -hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip6_netmask(ip, prefix); -} - static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { @@ -227,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, 
&h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; @@ -266,7 +262,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; @@ -293,6 +289,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7303138e46be..2ffbd0b78a8c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -26,7 +26,8 @@ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ +/* 6 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -35,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -92,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -129,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -253,12 +264,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -298,6 +314,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -356,6 +376,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 
3d09eefe998a..cdfb78c6e0d3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ /* 2 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ +/* 3 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -33,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net"); /* Type specific function prefix */ #define HTYPE hash_netnet #define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK #define IPSET_NET_COUNT 2 /* IPv4 variants */ @@ -153,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); - e.ip[0] &= ip_set_netmask(e.cidr[0]); - e.ip[1] &= ip_set_netmask(e.cidr[1]); + e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip); + e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -213,8 +216,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { - e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); - e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; @@ -404,6 +407,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -414,6 +422,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + const struct hash_netnet6 *h = set->data; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -453,6 +462,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -IPSET_ERR_HASH_ELEM; + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -484,6 +498,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 51ad557a525b..2fcc26507d69 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -132,21 +132,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = 
this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -168,21 +168,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -200,17 +200,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -2448,6 +2448,10 @@ static void __exit ip_vs_cleanup(void) ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); + /* common rcu_barrier() used by: + * - ip_vs_control_cleanup() + */ + rcu_barrier(); pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4d62059a6021..c9f598505642 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -49,8 +49,7 @@ MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. 
*/ -static DEFINE_MUTEX(__ip_vs_mutex); +DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ @@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work) } #endif +static void est_reload_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, est_reload_work.work); + int genid_done = atomic_read(&ipvs->est_genid_done); + unsigned long delay = HZ / 10; /* repeat startups after failure */ + bool repeat = false; + int genid; + int id; + + mutex_lock(&ipvs->est_mutex); + genid = atomic_read(&ipvs->est_genid); + for (id = 0; id < ipvs->est_kt_count; id++) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + goto unlock; + if (!kd) + continue; + /* New config ? Stop kthread tasks */ + if (genid != genid_done) + ip_vs_est_kthread_stop(kd); + if (!kd->task && !ip_vs_est_stopped(ipvs)) { + /* Do not start kthreads above 0 in calc phase */ + if ((!id || !ipvs->est_calc_phase) && + ip_vs_est_kthread_start(ipvs, kd) < 0) + repeat = true; + } + } + + atomic_set(&ipvs->est_genid_done, genid); + + if (repeat) + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, + delay); + +unlock: + mutex_unlock(&ipvs->est_mutex); +} + int ip_vs_use_count_inc(void) { @@ -471,7 +511,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) static void ip_vs_service_free(struct ip_vs_service *svc) { - free_percpu(svc->stats.cpustats); + ip_vs_stats_release(&svc->stats); kfree(svc); } @@ -483,17 +523,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head) ip_vs_service_free(svc); } -static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); - if (do_delay) - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); - else - ip_vs_service_free(svc); + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } @@ -780,14 +817,22 @@ out: return dest; } +static void ip_vs_dest_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest *dest; + + dest = container_of(head, struct ip_vs_dest, rcu_head); + ip_vs_stats_release(&dest->stats); + ip_vs_dest_put_and_free(dest); +} + static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); - __ip_vs_svc_put(svc, false); - free_percpu(dest->stats.cpustats); - ip_vs_dest_put_and_free(dest); + __ip_vs_svc_put(svc); + call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* @@ -811,12 +856,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) } } +static void ip_vs_stats_rcu_free(struct rcu_head *head) +{ + struct ip_vs_stats_rcu *rs = container_of(head, + struct ip_vs_stats_rcu, + rcu_head); + + ip_vs_stats_release(&rs->s); + kfree(rs); +} + static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c - spin_lock_bh(&src->lock); + spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); @@ -826,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) ip_vs_read_estimator(dst, src); - spin_unlock_bh(&src->lock); + spin_unlock(&src->lock); } static void @@ -847,7 +902,7 @@ ip_vs_export_stats_user(struct 
ip_vs_stats_user *dst, struct ip_vs_kstats *src) static void ip_vs_zero_stats(struct ip_vs_stats *stats) { - spin_lock_bh(&stats->lock); + spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ @@ -861,7 +916,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) ip_vs_zero_estimator(stats); - spin_unlock_bh(&stats->lock); + spin_unlock(&stats->lock); +} + +/* Allocate fields after kzalloc */ +int ip_vs_stats_init_alloc(struct ip_vs_stats *s) +{ + int i; + + spin_lock_init(&s->lock); + s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!s->cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); + + u64_stats_init(&cs->syncp); + } + return 0; +} + +struct ip_vs_stats *ip_vs_stats_alloc(void) +{ + struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); + + if (s && ip_vs_stats_init_alloc(s) >= 0) + return s; + kfree(s); + return NULL; +} + +void ip_vs_stats_release(struct ip_vs_stats *stats) +{ + free_percpu(stats->cpustats); +} + +void ip_vs_stats_free(struct ip_vs_stats *stats) +{ + if (stats) { + ip_vs_stats_release(stats); + kfree(stats); + } } /* @@ -923,7 +1019,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); - __ip_vs_svc_put(old_svc, true); + __ip_vs_svc_put(old_svc); } } @@ -942,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock_bh(&dest->dst_lock); if (add) { - ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); @@ -963,14 +1058,13 @@ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - unsigned int atype, i; + unsigned int atype; + int ret; EnterFunction(2); #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { - int ret; - atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -992,15 +1086,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest == NULL) return -ENOMEM; - dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!dest->stats.cpustats) + ret = ip_vs_stats_init_alloc(&dest->stats); + if (ret < 0) goto err_alloc; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_dest_stats; - ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); - u64_stats_init(&ip_vs_dest_stats->syncp); - } + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err_stats; dest->af = udest->af; dest->protocol = svc->protocol; @@ -1017,15 +1109,17 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); LeaveFunction(2); return 0; +err_stats: + ip_vs_stats_release(&dest->stats); + err_alloc: kfree(dest); - return -ENOMEM; + return ret; } @@ -1087,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err; __ip_vs_update_dest(svc, dest, udest, 1); - ret = 0; } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } + +err: LeaveFunction(2); return ret; @@ 
-1284,7 +1382,7 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0, i; + int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; @@ -1344,18 +1442,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ret = -ENOMEM; goto out_err; } - svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) { - ret = -ENOMEM; + ret = ip_vs_stats_init_alloc(&svc->stats); + if (ret < 0) goto out_err; - } - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_stats; - ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); - u64_stats_init(&ip_vs_stats->syncp); - } - /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); @@ -1372,7 +1461,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ if (sched) { @@ -1382,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, sched = NULL; } + ret = ip_vs_start_estimator(ipvs, &svc->stats); + if (ret < 0) + goto out_err; + /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; @@ -1394,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (svc->pe && svc->pe->conn_out) atomic_inc(&ipvs->conn_out_counter); - ip_vs_start_estimator(ipvs, &svc->stats); - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; @@ -1406,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); *svc_p = svc; - /* Now there is a service - full throttle */ - ipvs->enable = 1; + + if (!ipvs->enable) { + /* Now there is a service - full throttle */ + ipvs->enable = 1; + + /* Start estimation for first time */ + ip_vs_est_reload_start(ipvs); + } + return 0; @@ -1571,7 +1668,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* * Free the service if nobody refers to it */ - __ip_vs_svc_put(svc, true); + __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1761,7 +1858,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } } - ip_vs_zero_stats(&ipvs->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -1843,6 +1940,148 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } +static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + cpumask_var_t newmask; + int ret; + + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpulist_parse(buffer, newmask); + if (ret) + goto out; + + mutex_lock(&ipvs->est_mutex); + + if (!ipvs->est_cpulist_valid) { + if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { + ret = -ENOMEM; + goto unlock; + } + ipvs->est_cpulist_valid = 1; + } + cpumask_and(newmask, newmask, &current->cpus_mask); + cpumask_copy(*valp, newmask); + /* est_max_threads may depend on cpulist size */ + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + ipvs->est_calc_phase = 1; + ip_vs_est_reload_start(ipvs); + +unlock: + mutex_unlock(&ipvs->est_mutex); + +out: + free_cpumask_var(newmask); + return ret; +} + +static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, + size_t size) +{ + struct netns_ipvs
*ipvs = table->extra2; + cpumask_var_t *valp = table->data; + struct cpumask *mask; + int ret; + + mutex_lock(&ipvs->est_mutex); + + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + + mutex_unlock(&ipvs->est_mutex); + + return ret; +} + +static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + /* Ignore both read and write(append) if *ppos not 0 */ + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + if (write) { + /* proc_sys_call_handler() appends terminator */ + ret = ipvs_proc_est_cpumask_set(table, buffer); + if (ret >= 0) + *ppos += *lenp; + } else { + /* proc_sys_call_handler() allocates 1 byte for terminator */ + ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); + if (ret >= 0) { + *lenp = ret; + *ppos += *lenp; + ret = 0; + } + } + return ret; +} + +static int ipvs_proc_est_nice(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < MIN_NICE || val > MAX_NICE) { + ret = -EINVAL; + } else { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + } + return ret; +} + +static int ipvs_proc_run_estimation(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2017,7 +2256,19 @@ static struct ctl_table vs_vars[] = { .procname = "run_estimation", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = ipvs_proc_run_estimation, + }, + { + .procname = "est_cpulist", + .maxlen = NR_CPUS, /* unused */ + .mode = 0644, + .proc_handler = ipvs_proc_est_cpulist, + }, + { + .procname = "est_nice", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_est_nice, }, #ifdef CONFIG_IP_VS_DEBUG { @@ -2255,7 +2506,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); - ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, @@ -2279,7 +2530,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct 
ip_vs_kstats kstats; int i; @@ -2297,11 +2548,11 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) do { start = u64_stats_fetch_begin(&u->syncp); - conns = u->cnt.conns; - inpkts = u->cnt.inpkts; - outpkts = u->cnt.outpkts; - inbytes = u->cnt.inbytes; - outbytes = u->cnt.outbytes; + conns = u64_stats_read(&u->cnt.conns); + inpkts = u64_stats_read(&u->cnt.inpkts); + outpkts = u64_stats_read(&u->cnt.outpkts); + inbytes = u64_stats_read(&u->cnt.inbytes); + outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", @@ -4027,13 +4278,17 @@ static void ip_vs_genl_unregister(void) static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; - int idx; struct ctl_table *tbl; + int idx, ret; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); + ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); @@ -4094,31 +4349,44 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; + tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; + + ipvs->est_cpulist_valid = 0; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_cpulist; + + ipvs->sysctl_est_nice = IPVS_EST_NICE; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_nice; + #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) tbl[idx++].mode = 0444; #endif + ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); - if (ipvs->sysctl_hdr == NULL) { - if (!net_eq(net, &init_net)) - kfree(tbl); - return -ENOMEM; - } - ip_vs_start_estimator(ipvs, &ipvs->tot_stats); + if (!ipvs->sysctl_hdr) + goto err; ipvs->sysctl_tbl = tbl; + + ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); + if (ret < 0) + goto err; + /* Schedule defense work */ - INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); - /* Init delayed work for expiring no dest conn */ - INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, - expire_nodest_conn_handler); - return 0; + +err: + unregister_net_sysctl_table(ipvs->sysctl_hdr); + if (!net_eq(net, &init_net)) + kfree(tbl); + return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) @@ -4129,7 +4397,10 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + + if (ipvs->est_cpulist_valid) + free_cpumask_var(ipvs->sysctl_est_cpulist); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -4151,7 +4422,8 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - int i, idx; + int ret = -ENOMEM; + int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4164,18 
+4436,14 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); - /* procfs stats */ - ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats.cpustats) - return -ENOMEM; - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ipvs_tot_stats; - ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); - u64_stats_init(&ipvs_tot_stats->syncp); - } + INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); - spin_lock_init(&ipvs->tot_stats.lock); + /* procfs stats */ + ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); + if (!ipvs->tot_stats) + goto out; + if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) + goto err_tot_stats; #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, @@ -4190,7 +4458,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) goto err_percpu; #endif - if (ip_vs_control_net_init_sysctl(ipvs)) + ret = ip_vs_control_net_init_sysctl(ipvs); + if (ret < 0) goto err; return 0; @@ -4207,20 +4476,26 @@ err_stats: err_vs: #endif - free_percpu(ipvs->tot_stats.cpustats); - return -ENOMEM; + ip_vs_stats_release(&ipvs->tot_stats->s); + +err_tot_stats: + kfree(ipvs->tot_stats); + +out: + return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); + cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif - free_percpu(ipvs->tot_stats.cpustats); + call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) @@ -4280,5 +4555,6 @@ void ip_vs_control_cleanup(void) { EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); + /* relying on common rcu_barrier() in ip_vs_cleanup() */ LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 9a1a7af6a186..df56073bb282 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -30,9 +30,6 @@ long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W @@ -47,68 +44,79 @@ to 32-bit values for conns, packets, bps, cps and pps. 
* A lot of code is taken from net/core/gen_estimator.c - */ - -/* - * Make a summary from each cpu + KEY POINTS: + - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled + - kthreads read the cpustats to update the estimators (svcs, dests, total) + - the states of estimators can be read (get stats) or modified (zero stats) + from processes + + KTHREADS: + - estimators are added initially to est_temp_list and later kthread 0 + distributes them to one or many kthreads for estimation + - kthread contexts are created and attached to array + - the kthread tasks are started when first service is added, before that + the total stats are not estimated + - when configuration (cpulist/nice) is changed, the tasks are restarted + by work (est_reload_work) + - kthread tasks are stopped while the cpulist is empty + - the kthread context holds lists with estimators (chains) which are + processed every 2 seconds + - as estimators can be added dynamically and in bursts, we try to spread + them to multiple chains which are estimated at different time + - on start, kthread 0 enters calculation phase to determine the chain limits + and the limit of estimators per kthread + - est_add_ktid: ktid where to add new ests, can point to empty slot where + we should add kt data */ -static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, - struct ip_vs_cpu_stats __percpu *stats) -{ - int i; - bool add = false; - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); - unsigned int start; - u64 conns, inpkts, outpkts, inbytes, outbytes; - if (add) { - do { - start = u64_stats_fetch_begin(&s->syncp); - conns = s->cnt.conns; - inpkts = s->cnt.inpkts; - outpkts = s->cnt.outpkts; - inbytes = s->cnt.inbytes; - outbytes = s->cnt.outbytes; - } while (u64_stats_fetch_retry(&s->syncp, start)); - sum->conns += conns; - sum->inpkts += inpkts; - sum->outpkts += outpkts; - sum->inbytes += inbytes; - sum->outbytes += outbytes; - } else { - add = true; - do { - start = u64_stats_fetch_begin(&s->syncp); - sum->conns = s->cnt.conns; - sum->inpkts = s->cnt.inpkts; - sum->outpkts = s->cnt.outpkts; - sum->inbytes = s->cnt.inbytes; - sum->outbytes = s->cnt.outbytes; - } while (u64_stats_fetch_retry(&s->syncp, start)); - } - } -} +static struct lock_class_key __ipvs_est_key; +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); -static void estimation_timer(struct timer_list *t) +static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; + struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; - struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); - if (!sysctl_run_estimation(ipvs)) - goto skip; + hlist_for_each_entry_rcu(e, chain, list) { + u64 conns, inpkts, outpkts, inbytes, outbytes; + u64 kconns = 0, kinpkts = 0, koutpkts = 0; + u64 kinbytes = 0, koutbytes = 0; + unsigned int start; + int i; + + if (kthread_should_stop()) + break; - spin_lock(&ipvs->est_lock); - list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); + for_each_possible_cpu(i) { + c = per_cpu_ptr(s->cpustats, i); + do { + start = u64_stats_fetch_begin(&c->syncp); + conns = u64_stats_read(&c->cnt.conns); + inpkts = u64_stats_read(&c->cnt.inpkts); + outpkts = u64_stats_read(&c->cnt.outpkts); + inbytes = u64_stats_read(&c->cnt.inbytes); + outbytes = u64_stats_read(&c->cnt.outbytes); + } while (u64_stats_fetch_retry(&c->syncp, start)); + kconns += conns; + kinpkts += inpkts; + 
koutpkts += outpkts; + kinbytes += inbytes; + koutbytes += outbytes; + } spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->kstats, s->cpustats); + + s->kstats.conns = kconns; + s->kstats.inpkts = kinpkts; + s->kstats.outpkts = koutpkts; + s->kstats.inbytes = kinbytes; + s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; @@ -133,30 +141,757 @@ static void estimation_timer(struct timer_list *t) e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } - spin_unlock(&ipvs->est_lock); +} -skip: - mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) +{ + struct ip_vs_est_tick_data *td; + int cid; + + rcu_read_lock(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + goto out; + for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { + if (kthread_should_stop()) + break; + ip_vs_chain_estimation(&td->chains[cid]); + cond_resched_rcu(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + break; + } + +out: + rcu_read_unlock(); } -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +static int ip_vs_estimation_kthread(void *data) { - struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_kt_data *kd = data; + struct netns_ipvs *ipvs = kd->ipvs; + int row = kd->est_row; + unsigned long now; + int id = kd->id; + long gap; + + if (id > 0) { + if (!ipvs->est_chain_max) + return 0; + } else { + if (!ipvs->est_chain_max) { + ipvs->est_calc_phase = 1; + /* commit est_calc_phase before reading est_genid */ + smp_mb(); + } + + /* kthread 0 will handle the calc phase */ + if (ipvs->est_calc_phase) + ip_vs_est_calc_phase(ipvs); + } + + while (1) { + if (!id && !hlist_empty(&ipvs->est_temp_list)) + ip_vs_est_drain_temp_list(ipvs); + set_current_state(TASK_IDLE); + if (kthread_should_stop()) + break; + + /* before estimation, check if we should sleep */ + now = jiffies; + gap = kd->est_timer - now; + if (gap > 0) { + if (gap > IPVS_EST_TICK) { + kd->est_timer = now - IPVS_EST_TICK; + gap = IPVS_EST_TICK; + } + schedule_timeout(gap); + } else { + __set_current_state(TASK_RUNNING); + if (gap < -8 * IPVS_EST_TICK) + kd->est_timer = now; + } + + if (kd->tick_len[row]) + ip_vs_tick_estimation(kd, row); + + row++; + if (row >= IPVS_EST_NTICKS) + row = 0; + WRITE_ONCE(kd->est_row, row); + kd->est_timer += IPVS_EST_TICK; + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* Schedule stop/start for kthread tasks */ +void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +{ + /* Ignore reloads before first service is added */ + if (!ipvs->enable) + return; + ip_vs_est_stopped_recalc(ipvs); + /* Bump the kthread configuration genid */ + atomic_inc(&ipvs->est_genid); + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); +} + +/* Start kthread task with current configuration */ +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + unsigned long now; + int ret = 0; + long gap; + + lockdep_assert_held(&ipvs->est_mutex); + + if (kd->task) + goto out; + now = jiffies; + gap = kd->est_timer - now; + /* Sync est_timer if task is starting later */ + if (abs(gap) > 4 * IPVS_EST_TICK) + kd->est_timer = now; + kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", + ipvs->gen, kd->id); + if (IS_ERR(kd->task)) { + ret = PTR_ERR(kd->task); + kd->task = NULL; + goto out; + } + + set_user_nice(kd->task, sysctl_est_nice(ipvs)); + set_cpus_allowed_ptr(kd->task, 
sysctl_est_cpulist(ipvs)); + + pr_info("starting estimator thread %d...\n", kd->id); + wake_up_process(kd->task); + +out: + return ret; +} + +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) +{ + if (kd->task) { + pr_info("stopping estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + kd->task = NULL; + } +} + +/* Apply parameters to kthread */ +static void ip_vs_est_set_params(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + kd->chain_max = ipvs->est_chain_max; + /* We are using single chain on RCU preemption */ + if (IPVS_EST_TICK_CHAINS == 1) + kd->chain_max *= IPVS_EST_CHAIN_FACTOR; + kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; + kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; +} + +/* Create and start estimation kthread in a free or new array slot */ +static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) +{ + struct ip_vs_est_kt_data *kd = NULL; + int id = ipvs->est_kt_count; + int ret = -ENOMEM; + void *arr = NULL; + int i; + + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + ipvs->enable && ipvs->est_max_threads) + return -EINVAL; + + mutex_lock(&ipvs->est_mutex); + + for (i = 0; i < id; i++) { + if (!ipvs->est_kt_arr[i]) + break; + } + if (i >= id) { + arr = krealloc_array(ipvs->est_kt_arr, id + 1, + sizeof(struct ip_vs_est_kt_data *), + GFP_KERNEL); + if (!arr) + goto out; + ipvs->est_kt_arr = arr; + } else { + id = i; + } - INIT_LIST_HEAD(&est->list); + kd = kzalloc(sizeof(*kd), GFP_KERNEL); + if (!kd) + goto out; + kd->ipvs = ipvs; + bitmap_fill(kd->avail, IPVS_EST_NTICKS); + kd->est_timer = jiffies; + kd->id = id; + ip_vs_est_set_params(ipvs, kd); + + /* Pre-allocate stats used in calc phase */ + if (!id && !kd->calc_stats) { + kd->calc_stats = ip_vs_stats_alloc(); + if (!kd->calc_stats) + goto out; + } + + /* Start kthread tasks only when services are present */ + if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { + ret = ip_vs_est_kthread_start(ipvs, kd); + if (ret < 0) + goto out; + } + + if (arr) + ipvs->est_kt_count++; + ipvs->est_kt_arr[id] = kd; + kd = NULL; + /* Use most recent kthread for new ests */ + ipvs->est_add_ktid = id; + ret = 0; + +out: + mutex_unlock(&ipvs->est_mutex); + if (kd) { + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } - spin_lock_bh(&ipvs->est_lock); - list_add(&est->list, &ipvs->est_list); - spin_unlock_bh(&ipvs->est_lock); + return ret; } +/* Select ktid where to add new ests: available, unused or new slot */ +static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) +{ + int ktid, best = ipvs->est_kt_count; + struct ip_vs_est_kt_data *kd; + + for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { + kd = ipvs->est_kt_arr[ktid]; + if (kd) { + if (kd->est_count < kd->est_max_count) { + best = ktid; + break; + } + } else if (ktid < best) { + best = ktid; + } + } + ipvs->est_add_ktid = best; +} + +/* Add estimator to current kthread (est_add_ktid) */ +static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, + struct ip_vs_estimator *est) +{ + struct ip_vs_est_kt_data *kd = NULL; + struct ip_vs_est_tick_data *td; + int ktid, row, crow, cid, ret; + int delay = est->ktrow; + + BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, + "Too many chains for ktcid"); + + if (ipvs->est_add_ktid < ipvs->est_kt_count) { + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + if (kd) + goto add_est; + } + + ret = ip_vs_est_add_kthread(ipvs); + if (ret < 0) + goto out; + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + +add_est: + ktid = kd->id; + /* For small number of estimators prefer to use few ticks, + * 
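The kthread loop above spreads one 2-second estimation period over IPVS_EST_NTICKS rows and handles a single row per tick, so the work arrives in many short slices instead of one burst. A toy illustration of that layout (the tick count used here is an assumed value, not taken from this hunk):

#include <stdio.h>

#define EST_PERIOD_MS	2000	/* full estimation period: 2 seconds */
#define EST_NTICKS	50	/* assumed stand-in for IPVS_EST_NTICKS */

int main(void)
{
	int row = 0, tick;

	printf("one tick is %d ms\n", EST_PERIOD_MS / EST_NTICKS);
	for (tick = 0; tick < 5; tick++) {
		printf("tick %d: estimate chains in row %d\n", tick, row);
		row++;
		if (row >= EST_NTICKS)
			row = 0;
	}
	return 0;
}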
otherwise try to add into the last estimated row. + * est_row and add_row point after the row we should use + */ + if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) + crow = READ_ONCE(kd->est_row); + else + crow = kd->add_row; + crow += delay; + if (crow >= IPVS_EST_NTICKS) + crow -= IPVS_EST_NTICKS; + /* Assume initial delay ? */ + if (delay >= IPVS_EST_NTICKS - 1) { + /* Preserve initial delay or decrease it if no space in tick */ + row = crow; + if (crow < IPVS_EST_NTICKS - 1) { + crow++; + row = find_last_bit(kd->avail, crow); + } + if (row >= crow) + row = find_last_bit(kd->avail, IPVS_EST_NTICKS); + } else { + /* Preserve delay or increase it if no space in tick */ + row = IPVS_EST_NTICKS; + if (crow > 0) + row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); + if (row >= IPVS_EST_NTICKS) + row = find_first_bit(kd->avail, IPVS_EST_NTICKS); + } + + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) { + td = kzalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + ret = -ENOMEM; + goto out; + } + rcu_assign_pointer(kd->ticks[row], td); + } + + cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); + + kd->est_count++; + kd->tick_len[row]++; + if (!td->chain_len[cid]) + __set_bit(cid, td->present); + td->chain_len[cid]++; + est->ktid = ktid; + est->ktrow = row; + est->ktcid = cid; + hlist_add_head_rcu(&est->list, &td->chains[cid]); + + if (td->chain_len[cid] >= kd->chain_max) { + __set_bit(cid, td->full); + if (kd->tick_len[row] >= kd->tick_max) + __clear_bit(row, kd->avail); + } + + /* Update est_add_ktid to point to first available/empty kt slot */ + if (kd->est_count == kd->est_max_count) + ip_vs_est_update_ktid(ipvs); + + ret = 0; + +out: + return ret; +} + +/* Start estimation for stats */ +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + int ret; + + if (!ipvs->est_max_threads && ipvs->enable) + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + + est->ktid = -1; + est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ + + /* We prefer this code to be short, kthread 0 will requeue the + * estimator to available chain. If tasks are disabled, we + * will not allocate much memory, just for kt 0. + */ + ret = 0; + if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + ret = ip_vs_est_add_kthread(ipvs); + if (ret >= 0) + hlist_add_head(&est->list, &ipvs->est_temp_list); + else + INIT_HLIST_NODE(&est->list); + return ret; +} + +static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) +{ + if (kd) { + if (kd->task) { + pr_info("stop unused estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + } + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } +} + +/* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + int ktid = est->ktid; + int row = est->ktrow; + int cid = est->ktcid; + + /* Failed to add to chain ? */ + if (hlist_unhashed(&est->list)) + return; + + /* On return, estimator can be freed, dequeue it now */ + + /* In est_temp_list ? 
*/ + if (ktid < 0) { + hlist_del(&est->list); + goto end_kt0; + } + + hlist_del_rcu(&est->list); + kd = ipvs->est_kt_arr[ktid]; + td = rcu_dereference_protected(kd->ticks[row], 1); + __clear_bit(cid, td->full); + td->chain_len[cid]--; + if (!td->chain_len[cid]) + __clear_bit(cid, td->present); + kd->tick_len[row]--; + __set_bit(row, kd->avail); + if (!kd->tick_len[row]) { + RCU_INIT_POINTER(kd->ticks[row], NULL); + kfree_rcu(td); + } + kd->est_count--; + if (kd->est_count) { + /* This kt slot can become available just now, prefer it */ + if (ktid < ipvs->est_add_ktid) + ipvs->est_add_ktid = ktid; + return; + } - spin_lock_bh(&ipvs->est_lock); - list_del(&est->list); - spin_unlock_bh(&ipvs->est_lock); + if (ktid > 0) { + mutex_lock(&ipvs->est_mutex); + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[ktid] = NULL; + if (ktid == ipvs->est_kt_count - 1) { + ipvs->est_kt_count--; + while (ipvs->est_kt_count > 1 && + !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) + ipvs->est_kt_count--; + } + mutex_unlock(&ipvs->est_mutex); + + /* This slot is now empty, prefer another available kt slot */ + if (ktid == ipvs->est_add_ktid) + ip_vs_est_update_ktid(ipvs); + } + +end_kt0: + /* kt 0 is freed after all other kthreads and chains are empty */ + if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { + kd = ipvs->est_kt_arr[0]; + if (!kd || !kd->est_count) { + mutex_lock(&ipvs->est_mutex); + if (kd) { + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[0] = NULL; + } + ipvs->est_kt_count--; + mutex_unlock(&ipvs->est_mutex); + ipvs->est_add_ktid = 0; + } + } +} + +/* Register all ests from est_temp_list to kthreads */ +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) +{ + struct ip_vs_estimator *est; + + while (1) { + int max = 16; + + mutex_lock(&__ip_vs_mutex); + + while (max-- > 0) { + est = hlist_entry_safe(ipvs->est_temp_list.first, + struct ip_vs_estimator, list); + if (est) { + if (kthread_should_stop()) + goto unlock; + hlist_del_init(&est->list); + if (ip_vs_enqueue_estimator(ipvs, est) >= 0) + continue; + est->ktid = -1; + hlist_add_head(&est->list, + &ipvs->est_temp_list); + /* Abort, some entries will not be estimated + * until next attempt + */ + } + goto unlock; + } + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + } + +unlock: + mutex_unlock(&__ip_vs_mutex); +} + +/* Calculate limits for all kthreads */ +static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) +{ + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + struct ip_vs_est_kt_data *kd; + struct hlist_head chain; + struct ip_vs_stats *s; + int cache_factor = 4; + int i, loops, ntest; + s32 min_est = 0; + ktime_t t1, t2; + s64 diff, val; + int max = 8; + int ret = 1; + + INIT_HLIST_HEAD(&chain); + mutex_lock(&__ip_vs_mutex); + kd = ipvs->est_kt_arr[0]; + mutex_unlock(&__ip_vs_mutex); + s = kd ? 
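ip_vs_est_drain_temp_list() above moves estimators to kthreads in small batches and releases __ip_vs_mutex between batches so configuration work is not blocked for long. A userspace sketch of the same batch-and-yield idea (the mutex, counters and names here are illustrative only, not the kernel locking):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define BATCH	16

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static int pending = 40;	/* items waiting on a temp list */

static void drain_temp_list(void)
{
	for (;;) {
		int batch = BATCH;

		pthread_mutex_lock(&big_lock);
		while (batch-- > 0 && pending > 0)
			pending--;	/* "enqueue one estimator" */
		printf("%d item(s) still pending\n", pending);
		if (pending == 0) {
			pthread_mutex_unlock(&big_lock);
			return;
		}
		pthread_mutex_unlock(&big_lock);
		sched_yield();		/* stand-in for cond_resched() */
	}
}

int main(void)
{
	drain_temp_list();
	puts("temp list drained");
	return 0;
}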
kd->calc_stats : NULL; + if (!s) + goto out; + hlist_add_head(&s->est.list, &chain); + + loops = 1; + /* Get best result from many tests */ + for (ntest = 0; ntest < 12; ntest++) { + if (!(ntest & 3)) { + /* Wait for cpufreq frequency transition */ + wait_event_idle_timeout(wq, kthread_should_stop(), + HZ / 50); + if (!ipvs->enable || kthread_should_stop()) + goto stop; + } + + local_bh_disable(); + rcu_read_lock(); + + /* Put stats in cache */ + ip_vs_chain_estimation(&chain); + + t1 = ktime_get(); + for (i = loops * cache_factor; i > 0; i--) + ip_vs_chain_estimation(&chain); + t2 = ktime_get(); + + rcu_read_unlock(); + local_bh_enable(); + + if (!ipvs->enable || kthread_should_stop()) + goto stop; + cond_resched(); + + diff = ktime_to_ns(ktime_sub(t2, t1)); + if (diff <= 1 * NSEC_PER_USEC) { + /* Do more loops on low time resolution */ + loops *= 2; + continue; + } + if (diff >= NSEC_PER_SEC) + continue; + val = diff; + do_div(val, loops); + if (!min_est || val < min_est) { + min_est = val; + /* goal: 95usec per chain */ + val = 95 * NSEC_PER_USEC; + if (val >= min_est) { + do_div(val, min_est); + max = (int)val; + } else { + max = 1; + } + } + } + +out: + if (s) + hlist_del_init(&s->est.list); + *chain_max = max; + return ret; + +stop: + ret = 0; + goto out; +} + +/* Calculate the parameters and apply them in context of kt #0 + * ECP: est_calc_phase + * ECM: est_chain_max + * ECP ECM Insert Chain enable Description + * --------------------------------------------------------------------------- + * 0 0 est_temp_list 0 create kt #0 context + * 0 0 est_temp_list 0->1 service added, start kthread #0 task + * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase + * 1 0 est_temp_list 1 kt #0: determine est_chain_max, + * stop tasks, move ests to est_temp_list + * and free kd for kthreads 1..last + * 1->0 0->N kt chains 1 ests can go to kthreads + * 0 N kt chains 1 drain est_temp_list, create new kthread + * contexts, start tasks, estimate + */ +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) +{ + int genid = atomic_read(&ipvs->est_genid); + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + struct ip_vs_estimator *est; + struct ip_vs_stats *stats; + int id, row, cid, delay; + bool last, last_td; + int chain_max; + int step; + + if (!ip_vs_est_calc_limits(ipvs, &chain_max)) + return; + + mutex_lock(&__ip_vs_mutex); + + /* Stop all other tasks, so that we can immediately move the + * estimators to est_temp_list without RCU grace period + */ + mutex_lock(&ipvs->est_mutex); + for (id = 1; id < ipvs->est_kt_count; id++) { + /* netns clean up started, abort */ + if (!ipvs->enable) + goto unlock2; + kd = ipvs->est_kt_arr[id]; + if (!kd) + continue; + ip_vs_est_kthread_stop(kd); + } + mutex_unlock(&ipvs->est_mutex); + + /* Move all estimators to est_temp_list but carefully, + * all estimators and kthread data can be released while + * we reschedule. Even for kthread 0. 
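ip_vs_est_calc_limits() above times a single-estimator chain and aims at roughly 95us of work per chain; ip_vs_est_set_params() then derives the per-tick and per-kthread capacity from that limit. A back-of-the-envelope version of the arithmetic, with an invented per-estimator cost and assumed tick/chain counts:

#include <stdio.h>

#define NSEC_PER_USEC	1000ULL

int main(void)
{
	unsigned long long min_est_ns = 120;		/* assumed cost per estimator */
	unsigned long long goal = 95 * NSEC_PER_USEC;	/* 95us budget per chain */
	int nticks = 50, chains_per_tick = 48;		/* assumed limits */
	int chain_max, tick_max, est_max_count;

	chain_max = (int)(goal / min_est_ns);
	tick_max = chains_per_tick * chain_max;
	est_max_count = nticks * tick_max;
	printf("chain_max=%d tick_max=%d est_max_count=%d per kthread\n",
	       chain_max, tick_max, est_max_count);
	return 0;
}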
+ */ + step = 0; + + /* Order entries in est_temp_list in ascending delay, so now + * walk delay(desc), id(desc), cid(asc) + */ + delay = IPVS_EST_NTICKS; + +next_delay: + delay--; + if (delay < 0) + goto end_dequeue; + +last_kt: + /* Destroy contexts backwards */ + id = ipvs->est_kt_count; + +next_kt: + if (!ipvs->enable || kthread_should_stop()) + goto unlock; + id--; + if (id < 0) + goto next_delay; + kd = ipvs->est_kt_arr[id]; + if (!kd) + goto next_kt; + /* kt 0 can exist with empty chains */ + if (!id && kd->est_count <= 1) + goto next_delay; + + row = kd->est_row + delay; + if (row >= IPVS_EST_NTICKS) + row -= IPVS_EST_NTICKS; + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) + goto next_kt; + + cid = 0; + +walk_chain: + if (kthread_should_stop()) + goto unlock; + step++; + if (!(step & 63)) { + /* Give chance estimators to be added (to est_temp_list) + * and deleted (releasing kthread contexts) + */ + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + mutex_lock(&__ip_vs_mutex); + + /* Current kt released ? */ + if (id >= ipvs->est_kt_count) + goto last_kt; + if (kd != ipvs->est_kt_arr[id]) + goto next_kt; + /* Current td released ? */ + if (td != rcu_dereference_protected(kd->ticks[row], 1)) + goto next_kt; + /* No fatal changes on the current kd and td */ + } + est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, + list); + if (!est) { + cid++; + if (cid >= IPVS_EST_TICK_CHAINS) + goto next_kt; + goto walk_chain; + } + /* We can cheat and increase est_count to protect kt 0 context + * from release but we prefer to keep the last estimator + */ + last = kd->est_count <= 1; + /* Do not free kt #0 data */ + if (!id && last) + goto next_delay; + last_td = kd->tick_len[row] <= 1; + stats = container_of(est, struct ip_vs_stats, est); + ip_vs_stop_estimator(ipvs, stats); + /* Tasks are stopped, move without RCU grace period */ + est->ktid = -1; + est->ktrow = row - kd->est_row; + if (est->ktrow < 0) + est->ktrow += IPVS_EST_NTICKS; + hlist_add_head(&est->list, &ipvs->est_temp_list); + /* kd freed ? */ + if (last) + goto next_kt; + /* td freed ? */ + if (last_td) + goto next_kt; + goto walk_chain; + +end_dequeue: + /* All estimators removed while calculating ? 
*/ + if (!ipvs->est_kt_count) + goto unlock; + kd = ipvs->est_kt_arr[0]; + if (!kd) + goto unlock; + kd->add_row = kd->est_row; + ipvs->est_chain_max = chain_max; + ip_vs_est_set_params(ipvs, kd); + + pr_info("using max %d ests per chain, %d per kthread\n", + kd->chain_max, kd->est_max_count); + + /* Try to keep tot_stats in kt0, enqueue it early */ + if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && + ipvs->tot_stats->s.est.ktid == -1) { + hlist_del(&ipvs->tot_stats->s.est.list); + hlist_add_head(&ipvs->tot_stats->s.est.list, + &ipvs->est_temp_list); + } + + mutex_lock(&ipvs->est_mutex); + + /* We completed the calc phase, new calc phase not requested */ + if (genid == atomic_read(&ipvs->est_genid)) + ipvs->est_calc_phase = 0; + +unlock2: + mutex_unlock(&ipvs->est_mutex); + +unlock: + mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) @@ -191,14 +926,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - INIT_LIST_HEAD(&ipvs->est_list); - spin_lock_init(&ipvs->est_lock); - timer_setup(&ipvs->est_timer, estimation_timer, 0); - mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + INIT_HLIST_HEAD(&ipvs->est_temp_list); + ipvs->est_kt_arr = NULL; + ipvs->est_max_threads = 0; + ipvs->est_calc_phase = 0; + ipvs->est_chain_max = 0; + ipvs->est_kt_count = 0; + ipvs->est_add_ktid = 0; + atomic_set(&ipvs->est_genid, 0); + atomic_set(&ipvs->est_genid_done, 0); + __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&ipvs->est_timer); + int i; + + for (i = 0; i < ipvs->est_kt_count; i++) + ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); + kfree(ipvs->est_kt_arr); + mutex_destroy(&ipvs->est_mutex); } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 895b09cbd7cf..99323fb12d0f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -121,17 +121,61 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + +unsigned int nf_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + bool seqadj_needed; + __be16 frag_off; + u8 pnum; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || in_vrf_postrouting(state)) + return NF_ACCEPT; help = nfct_help(ct); + + seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb); + if (!help && !seqadj_needed) + return nf_conntrack_confirm(skb); + + /* helper->help() do not expect ICMP packets */ + if (ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case NFPROTO_IPV6: + pnum = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return 
nf_conntrack_confirm(skb); + break; + default: + return nf_conntrack_confirm(skb); + } + if (help) { const struct nf_conntrack_helper *helper; int ret; - /* rcu_read_lock()ed by nf_hook_thresh */ + /* rcu_read_lock()ed by nf_hook */ helper = rcu_dereference(help->helper); if (helper) { ret = helper->help(skb, @@ -142,12 +186,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } } - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } + if (seqadj_needed && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; } /* We've seen it coming out the other side: confirm it */ @@ -155,35 +197,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } EXPORT_SYMBOL_GPL(nf_confirm); -static bool in_vrf_postrouting(const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (state->hook == NF_INET_POST_ROUTING && - netif_is_l3_master(state->out)) - return true; -#endif - return false; -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - return nf_confirm(skb, - skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -230,13 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, @@ -373,33 +386,6 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return nf_conntrack_confirm(skb); - } - - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -428,13 +414,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST - 1, diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 61e3b05cf02c..1020d67600a9 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ 
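The unified nf_confirm() above computes the transport header offset itself, using ipv6_skip_exthdr() for IPv6. A simplified userspace walk of IPv6 extension headers shows the idea; it ignores fragment and AH handling, and the constants and helper name are invented for the sketch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum {
	NEXTHDR_HOP	= 0,	/* hop-by-hop options */
	NEXTHDR_TCP	= 6,
	NEXTHDR_ROUTING	= 43,
	NEXTHDR_DEST	= 60,	/* destination options */
};

/* Return the transport header offset, or -1 if the packet is truncated. */
static int skip_exthdr(const uint8_t *pkt, size_t len, uint8_t *proto)
{
	size_t off = 40;	/* fixed IPv6 header size */
	uint8_t nexthdr;

	if (len < off)
		return -1;
	nexthdr = pkt[6];	/* ipv6hdr->nexthdr */
	while (nexthdr == NEXTHDR_HOP || nexthdr == NEXTHDR_ROUTING ||
	       nexthdr == NEXTHDR_DEST) {
		if (off + 2 > len)
			return -1;
		nexthdr = pkt[off];			/* next header field */
		off += ((size_t)pkt[off + 1] + 1) * 8;	/* hdr ext len, 8-octet units */
	}
	*proto = nexthdr;
	return (int)off;
}

int main(void)
{
	uint8_t pkt[64] = { 0 };
	uint8_t proto = 0;
	int off;

	pkt[6] = NEXTHDR_HOP;	/* IPv6 header: hop-by-hop options follow */
	pkt[40] = NEXTHDR_TCP;	/* options header: TCP follows */
	pkt[41] = 0;		/* options length: (0 + 1) * 8 = 8 bytes */
	off = skip_exthdr(pkt, sizeof(pkt), &proto);
	printf("l4 proto %u at offset %d\n", proto, off);	/* 6 at 48 */
	return 0;
}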
b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -129,6 +129,56 @@ static void icmpv6_error_log(const struct sk_buff *skb, nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } +static noinline_for_stack int +nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + u8 hl = ipv6_hdr(skb)->hop_limit; + union nf_inet_addr outer_daddr; + union { + struct nd_opt_hdr nd_opt; + struct rd_msg rd_msg; + } tmp; + const struct nd_opt_hdr *nd_opt; + const struct rd_msg *rd_msg; + + rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); + if (!rd_msg) { + icmpv6_error_log(skb, state, "short redirect"); + return -NF_ACCEPT; + } + + if (rd_msg->icmph.icmp6_code != 0) + return NF_ACCEPT; + + if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); + return -NF_ACCEPT; + } + + dataoff += sizeof(*rd_msg); + + /* warning: rd_msg no longer usable after this call */ + nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); + if (!nd_opt || nd_opt->nd_opt_len == 0) { + icmpv6_error_log(skb, state, "redirect without options"); + return -NF_ACCEPT; + } + + /* We could call ndisc_parse_options(), but it would need + * skb_linearize() and a bit more work. + */ + if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) + return NF_ACCEPT; + + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += 8; + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); +} + int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -159,6 +209,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, return NF_ACCEPT; } + if (icmp6h->icmp6_type == NDISC_REDIRECT) + return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); + /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 5a936334b517..d88b92a8ffca 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -60,6 +60,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, + [SCTP_CONNTRACK_DATA_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 @@ -74,6 +75,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED +#define sDS SCTP_CONNTRACK_DATA_SENT #define sIV SCTP_CONNTRACK_MAX /* @@ -90,15 +92,16 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. -SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. 
HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. -HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to - that of the HEARTBEAT chunk. Secondary connection is - established. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction + opposite to that of the HEARTBEAT/DATA chunk. Secondary connection + is established. +DATA_SENT - We have seen a DATA/SACK in a new flow. */ /* TODO @@ -112,36 +115,38 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, -/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, +/* data/sack */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, -/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ +/* init */ {sIV, sCL, sCW, sCE, 
sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, +/* data/sack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, } }; @@ -253,6 +258,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; + case SCTP_CID_DATA: + case SCTP_CID_SACK: + pr_debug("SCTP_CID_DATA/SACK"); + i = 11; + break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", @@ -306,7 +316,9 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; - } else if (sch->type == SCTP_CID_HEARTBEAT) { + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_DATA || + sch->type == SCTP_CID_SACK) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; @@ -392,19 +404,19 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; - } - - /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, map) && - !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && - !test_bit(SCTP_CID_COOKIE_ECHO, map) && - !test_bit(SCTP_CID_ABORT, map) && - !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && - !test_bit(SCTP_CID_HEARTBEAT, map) && - !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && - sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); - goto out; + } else { + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out; + } } old_state = new_state = SCTP_CONNTRACK_NONE; @@ -464,6 +476,11 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } + } else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } } old_state = ct->proto.sctp.state; @@ -684,6 +701,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_DATA_SENT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ diff --git 
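The enlarged sctp_conntracks[] table above is a plain table-driven state machine: the next state is looked up by direction, chunk type and current state. A toy version with a cut-down state and event set makes the lookup explicit; the transitions below are illustrative only, not the real SCTP table:

#include <stdio.h>

enum state { sNO, sCW, sES, sCL, STATE_MAX };
enum event { EV_INIT, EV_COOKIE_ACK, EV_ABORT, EVENT_MAX };

static const unsigned char toy_fsm[2][EVENT_MAX][STATE_MAX] = {
	{ /* ORIGINAL */
	/*                 sNO  sCW  sES  sCL */
	/* init       */ { sCW, sCW, sES, sCW },
	/* cookie_ack */ { sNO, sCW, sES, sCL },	/* can't come in orig dir */
	/* abort      */ { sCL, sCL, sCL, sCL },
	},
	{ /* REPLY */
	/* init       */ { sNO, sCW, sES, sCL },
	/* cookie_ack */ { sNO, sES, sES, sCL },
	/* abort      */ { sCL, sCL, sCL, sCL },
	},
};

int main(void)
{
	unsigned char s = sNO;

	s = toy_fsm[0][EV_INIT][s];		/* client INIT       -> sCW */
	s = toy_fsm[1][EV_COOKIE_ACK][s];	/* server COOKIE_ACK -> sES */
	printf("final state %u (2 = established)\n", s);
	return 0;
}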
a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index bca839ab1ae8..0250725e38a4 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -602,6 +602,7 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, @@ -892,6 +893,12 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = { + .procname = "nf_conntrack_sctp_timeout_data_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { @@ -1036,6 +1043,7 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); XASSIGN(HEARTBEAT_ACKED, sn); + XASSIGN(DATA_SENT, sn); #undef XASSIGN #endif } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index b350fe9d00b0..19efba1e51ef 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -421,6 +421,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (ret == NF_DROP) flow_offload_teardown(flow); break; + default: + WARN_ON_ONCE(1); + ret = NF_DROP; + break; } return ret; @@ -682,6 +686,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (ret == NF_DROP) flow_offload_teardown(flow); break; + default: + WARN_ON_ONCE(1); + ret = NF_DROP; + break; } return ret; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6269b0d9977c..832b881f7c17 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2873,8 +2873,8 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, return -EINVAL; type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); - if (IS_ERR(type)) - return PTR_ERR(type); + if (!type) + return -ENOENT; if (!type->inner_ops) return -EOPNOTSUPP; diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh index b48e1833bc89..76645aaf2b58 100755 --- a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh +++ b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh @@ -35,6 +35,8 @@ cleanup() { for i in 1 2;do ip netns del nsrouter$i;done } +trap cleanup EXIT + ipv4() { echo -n 192.168.$1.2 } @@ -146,11 +148,17 @@ ip netns exec nsclient1 nft -f - <<EOF table inet filter { counter unknown { } counter related { } + counter redir4 { } + counter redir6 { } chain input { type filter hook input priority 0; policy accept; - meta l4proto { icmp, icmpv6 } ct state established,untracked accept + icmp type "redirect" ct state "related" counter name "redir4" accept + icmpv6 type "nd-redirect" ct state "related" counter name "redir6" accept + + meta l4proto { icmp, icmpv6 } ct state established,untracked accept meta l4proto { icmp, icmpv6 } ct state "related" counter name "related" accept + counter name "unknown" drop } } @@ -279,5 +287,29 @@ else echo "ERROR: icmp error RELATED state test has failed" fi -cleanup +# add 'bad' route, expect icmp REDIRECT to be generated +ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1 +ip 
netns exec nsclient1 ip route add dead:1::42 via dead:1::1 + +ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null + +expect="packets 1 bytes 112" +check_counter nsclient1 "redir4" "$expect" +if [ $? -ne 0 ];then + ret=1 +fi + +ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null +expect="packets 1 bytes 192" +check_counter nsclient1 "redir6" "$expect" +if [ $? -ne 0 ];then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: icmp redirects had RELATED state" +else + echo "ERROR: icmp redirect RELATED state test has failed" +fi + exit $ret |