diff options
author | Johannes Berg <johannes.berg@intel.com> | 2015-10-13 10:28:16 +0200 |
---|---|---|
committer | Johannes Berg <johannes.berg@intel.com> | 2015-10-13 10:28:43 +0200 |
commit | 985f2c87a7109a42cac93f56ea595353d53a1746 (patch) | |
tree | 69fe0acbbfeb6a348d6f4c64750d278e56b935b3 /net | |
parent | 4d57c67827d7bb79c4fcf6618bf80930808e50c6 (diff) | |
parent | 6623c60dc28ee966cd85c6f12aa2fc3c952d0179 (diff) | |
download | linux-985f2c87a7109a42cac93f56ea595353d53a1746.tar.gz |
Merge remote-tracking branch 'net-next/master' into mac80211-next
Merge net-next to get some driver changes that patches depend
on (in order to avoid conflicts).
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Diffstat (limited to 'net')
416 files changed, 11907 insertions, 7133 deletions
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index ae1896fa45e2..83b19e072224 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -17,6 +17,11 @@ void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype) { + dev->addr_len = EUI64_ADDR_LEN; + dev->type = ARPHRD_6LOWPAN; + dev->mtu = IPV6_MIN_MTU; + dev->priv_flags |= IFF_NO_QUEUE; + lowpan_priv(dev)->lltype = lltype; } EXPORT_SYMBOL(lowpan_netdev_setup); diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 1e0071fdcf72..78c8a495b571 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -366,7 +366,18 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, return err; } - hdr.payload_len = htons(skb->len); + switch (lowpan_priv(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_802154_cb(skb)->d_size) + hdr.payload_len = htons(lowpan_802154_cb(skb)->d_size - + sizeof(struct ipv6hdr)); + else + hdr.payload_len = htons(skb->len); + break; + default: + hdr.payload_len = htons(skb->len); + break; + } pr_debug("skb headroom size = %d, data length = %d\n", skb_headroom(skb), skb->len); diff --git a/net/6lowpan/nhc.h b/net/6lowpan/nhc.h index ed44938eb5de..c249f17fa37b 100644 --- a/net/6lowpan/nhc.h +++ b/net/6lowpan/nhc.h @@ -8,8 +8,6 @@ #include <net/6lowpan.h> #include <net/ipv6.h> -#define LOWPAN_NHC_MAX_ID_LEN 1 - /** * LOWPAN_NHC - helper macro to generate nh id fields and lowpan_nhc struct * diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c index c6bcaeb428ae..72d0b57eb6e5 100644 --- a/net/6lowpan/nhc_udp.c +++ b/net/6lowpan/nhc_udp.c @@ -71,7 +71,18 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed) * here, we obtain the hint from the remaining size of the * frame */ - uh.len = htons(skb->len + sizeof(struct udphdr)); + switch (lowpan_priv(skb->dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_802154_cb(skb)->d_size) + uh.len = htons(lowpan_802154_cb(skb)->d_size - + sizeof(struct ipv6hdr)); + else + uh.len = htons(skb->len + sizeof(struct udphdr)); + break; + default: + uh.len = htons(skb->len + sizeof(struct udphdr)); + break; + } pr_debug("uncompressed UDP length: src = %d", ntohs(uh.len)); /* replace the compressed UDP head by the uncompressed UDP diff --git a/net/9p/client.c b/net/9p/client.c index 498454b3c06c..ea79ee9a7348 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1541,6 +1541,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) struct p9_client *clnt = fid->clnt; struct p9_req_t *req; int total = 0; + *err = 0; p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid, (unsigned long long) offset, (int)iov_iter_count(to)); @@ -1620,6 +1621,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) struct p9_client *clnt = fid->clnt; struct p9_req_t *req; int total = 0; + *err = 0; p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n", fid->fid, (unsigned long long) offset, diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 37a78d20c0f6..ba1210253f5e 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -94,8 +94,6 @@ struct p9_trans_rdma { struct ib_pd *pd; struct ib_qp *qp; struct ib_cq *cq; - struct ib_mr *dma_mr; - u32 lkey; long timeout; int sq_depth; struct semaphore sq_sem; @@ -382,9 +380,6 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma) if (!rdma) return; - if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) - ib_dereg_mr(rdma->dma_mr); - if (rdma->qp && !IS_ERR(rdma->qp)) ib_destroy_qp(rdma->qp); @@ -415,7 +410,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) sge.addr = c->busa; sge.length = client->msize; - sge.lkey = rdma->lkey; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; c->wc_op = IB_WC_RECV; @@ -506,7 +501,7 @@ dont_need_post_recv: sge.addr = c->busa; sge.length = c->req->tc->size; - sge.lkey = rdma->lkey; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; c->wc_op = IB_WC_SEND; @@ -647,7 +642,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; struct ib_cq_init_attr cq_attr = {}; /* Parse the transport specific mount options */ @@ -700,11 +694,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) goto error; - /* Query the device attributes */ - err = ib_query_device(rdma->cm_id->device, &devattr); - if (err) - goto error; - /* Create the Completion Queue */ cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, @@ -719,17 +708,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (IS_ERR(rdma->pd)) goto error; - /* Cache the DMA lkey in the transport */ - rdma->dma_mr = NULL; - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) - rdma->lkey = rdma->cm_id->device->local_dma_lkey; - else { - rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rdma->dma_mr)) - goto error; - rdma->lkey = rdma->dma_mr->lkey; - } - /* Create the Queue Pair */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; diff --git a/net/Kconfig b/net/Kconfig index 7021c1bf44d6..127da94ae25e 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -232,6 +232,7 @@ source "net/netlink/Kconfig" source "net/mpls/Kconfig" source "net/hsr/Kconfig" source "net/switchdev/Kconfig" +source "net/l3mdev/Kconfig" config RPS bool diff --git a/net/Makefile b/net/Makefile index 3995613e5510..a5d04098dfce 100644 --- a/net/Makefile +++ b/net/Makefile @@ -74,3 +74,6 @@ obj-$(CONFIG_HSR) += hsr/ ifneq ($(CONFIG_NET_SWITCHDEV),) obj-y += switchdev/ endif +ifneq ($(CONFIG_NET_L3_MASTER_DEV),) +obj-y += l3mdev/ +endif diff --git a/net/atm/clip.c b/net/atm/clip.c index 17e55dfecbe2..e07f551a863c 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh) static int clip_encap(struct atm_vcc *vcc, int mode) { + if (!CLIP_VCC(vcc)) + return -EBADFD; + CLIP_VCC(vcc)->encap = mode; return 0; } diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5c122000688f..912d9c36fb1c 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -296,39 +296,13 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_neigh_node *neigh_node, *tmp_neigh_node; + struct batadv_neigh_node *neigh_node; - neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node); + neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr); if (!neigh_node) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) { - kfree(neigh_node); - neigh_node = NULL; - goto out; - } - neigh_node->orig_node = orig_neigh; - neigh_node->if_incoming = hard_iface; - - spin_lock_bh(&orig_node->neigh_list_lock); - tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface, - neigh_addr); - if (!tmp_neigh_node) { - hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); - } else { - kfree(neigh_node); - batadv_hardif_free_ref(hard_iface); - neigh_node = tmp_neigh_node; - } - spin_unlock_bh(&orig_node->neigh_list_lock); - - if (!tmp_neigh_node) - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Creating new neighbor %pM for orig_node %pM on interface %s\n", - neigh_addr, orig_node->orig, - hard_iface->net_dev->name); out: return neigh_node; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index d7ca2144e62c..e6c8382c79ba 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -27,7 +27,6 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/netdevice.h> @@ -161,9 +160,6 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - if (gw_node->deleted) - continue; - orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) @@ -473,9 +469,6 @@ batadv_gw_node_get(struct batadv_priv *bat_priv, if (gw_node_tmp->orig_node != orig_node) continue; - if (gw_node_tmp->deleted) - continue; - if (!atomic_inc_not_zero(&gw_node_tmp->refcount)) continue; @@ -525,9 +518,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - gw_node->deleted = 0; if (ntohl(gateway->bandwidth_down) == 0) { - gw_node->deleted = jiffies; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway %pM removed from gateway list\n", orig_node->orig); @@ -535,14 +526,21 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, /* Note: We don't need a NULL check here, since curr_gw never * gets dereferenced. */ + spin_lock_bh(&bat_priv->gw.list_lock); + hlist_del_init_rcu(&gw_node->list); + spin_unlock_bh(&bat_priv->gw.list_lock); + + batadv_gw_node_free_ref(gw_node); + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); + + if (curr_gw) + batadv_gw_node_free_ref(curr_gw); } out: - if (curr_gw) - batadv_gw_node_free_ref(curr_gw); if (gw_node) batadv_gw_node_free_ref(gw_node); } @@ -558,39 +556,18 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig_node, &gateway); } -void batadv_gw_node_purge(struct batadv_priv *bat_priv) +void batadv_gw_node_free(struct batadv_priv *bat_priv) { - struct batadv_gw_node *gw_node, *curr_gw; + struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; - unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT); - int do_reselect = 0; - - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); spin_lock_bh(&bat_priv->gw.list_lock); - hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { - if (((!gw_node->deleted) || - (time_before(jiffies, gw_node->deleted + timeout))) && - atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) - continue; - - if (curr_gw == gw_node) - do_reselect = 1; - - hlist_del_rcu(&gw_node->list); + hlist_del_init_rcu(&gw_node->list); batadv_gw_node_free_ref(gw_node); } - spin_unlock_bh(&bat_priv->gw.list_lock); - - /* gw_reselect() needs to acquire the gw_list_lock */ - if (do_reselect) - batadv_gw_reselect(bat_priv); - - if (curr_gw) - batadv_gw_node_free_ref(curr_gw); } /* fails if orig_node has no router */ @@ -654,9 +631,6 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - if (gw_node->deleted) - continue; - /* fails if orig_node has no router */ if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0) continue; diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index ef4d7e336651..fa9527785ed3 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -38,7 +38,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_tvlv_gateway_data *gateway); void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); -void batadv_gw_node_purge(struct batadv_priv *bat_priv); +void batadv_gw_node_free(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index f4a15d2e5eaf..f11345e163d7 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -252,6 +252,44 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev) rcu_read_unlock(); } +/** + * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom + * @soft_iface: netdev struct of the mesh interface + */ +static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) +{ + const struct batadv_hard_iface *hard_iface; + unsigned short lower_header_len = ETH_HLEN; + unsigned short lower_headroom = 0; + unsigned short lower_tailroom = 0; + unsigned short needed_headroom; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) + continue; + + if (hard_iface->soft_iface != soft_iface) + continue; + + lower_header_len = max_t(unsigned short, lower_header_len, + hard_iface->net_dev->hard_header_len); + + lower_headroom = max_t(unsigned short, lower_headroom, + hard_iface->net_dev->needed_headroom); + + lower_tailroom = max_t(unsigned short, lower_tailroom, + hard_iface->net_dev->needed_tailroom); + } + rcu_read_unlock(); + + needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); + needed_headroom += batadv_max_header_len(); + + soft_iface->needed_headroom = needed_headroom; + soft_iface->needed_tailroom = lower_tailroom; +} + int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); @@ -474,6 +512,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, "Not using interface %s (retrying later): interface not active\n", hard_iface->net_dev->name); + batadv_hardif_recalc_extra_skbroom(soft_iface); + /* begin scheduling originator messages on that interface */ batadv_schedule_bat_ogm(hard_iface); @@ -528,6 +568,9 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_purge_outstanding_packets(bat_priv, hard_iface); dev_put(hard_iface->soft_iface); + netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); + batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); + /* nobody uses this interface anymore */ if (!bat_priv->num_ifaces) { batadv_gw_check_client_stop(bat_priv); @@ -536,7 +579,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_softif_destroy_sysfs(hard_iface->soft_iface); } - netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); hard_iface->soft_iface = NULL; batadv_hardif_free_ref(hard_iface); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index e61c5f3633d0..d7f17c1aa4a4 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -199,7 +199,7 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_purge_outstanding_packets(bat_priv, NULL); - batadv_gw_node_purge(bat_priv); + batadv_gw_node_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 78500ac725d6..ebd8af0a1eb0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2015.1" +#define BATADV_SOURCE_VERSION "2015.2" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 610620aa8d26..7486df9ed48d 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -443,40 +443,6 @@ out: } /** - * batadv_neigh_node_new - create and init a new neigh_node object - * @hard_iface: the interface where the neighbour is connected to - * @neigh_addr: the mac address of the neighbour interface - * @orig_node: originator object representing the neighbour - * - * Allocates a new neigh_node object and initialises all the generic fields. - * Returns the new object or NULL on failure. - */ -struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr, struct batadv_orig_node *orig_node) -{ - struct batadv_neigh_node *neigh_node; - - neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); - if (!neigh_node) - goto out; - - INIT_HLIST_NODE(&neigh_node->list); - INIT_HLIST_HEAD(&neigh_node->ifinfo_list); - spin_lock_init(&neigh_node->ifinfo_lock); - - ether_addr_copy(neigh_node->addr, neigh_addr); - neigh_node->if_incoming = hard_iface; - neigh_node->orig_node = orig_node; - - /* extra reference for return */ - atomic_set(&neigh_node->refcount, 2); - -out: - return neigh_node; -} - -/** * batadv_neigh_node_get - retrieve a neighbour from the list * @orig_node: originator which the neighbour belongs to * @hard_iface: the interface where this neighbour is connected to @@ -486,7 +452,7 @@ out: * which is connected through the provided hard interface. * Returns NULL if the neighbour is not found. */ -struct batadv_neigh_node * +static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, const struct batadv_hard_iface *hard_iface, const u8 *addr) @@ -513,6 +479,59 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, } /** + * batadv_neigh_node_new - create and init a new neigh_node object + * @orig_node: originator object representing the neighbour + * @hard_iface: the interface where the neighbour is connected to + * @neigh_addr: the mac address of the neighbour interface + * + * Allocates a new neigh_node object and initialises all the generic fields. + * Returns the new object or NULL on failure. + */ +struct batadv_neigh_node * +batadv_neigh_node_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) +{ + struct batadv_neigh_node *neigh_node; + + neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); + if (neigh_node) + goto out; + + neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); + if (!neigh_node) + goto out; + + if (!atomic_inc_not_zero(&hard_iface->refcount)) { + kfree(neigh_node); + neigh_node = NULL; + goto out; + } + + INIT_HLIST_NODE(&neigh_node->list); + INIT_HLIST_HEAD(&neigh_node->ifinfo_list); + spin_lock_init(&neigh_node->ifinfo_lock); + + ether_addr_copy(neigh_node->addr, neigh_addr); + neigh_node->if_incoming = hard_iface; + neigh_node->orig_node = orig_node; + + /* extra reference for return */ + atomic_set(&neigh_node->refcount, 2); + + spin_lock_bh(&orig_node->neigh_list_lock); + hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + spin_unlock_bh(&orig_node->neigh_list_lock); + + batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, + "Creating new neighbor %pM for orig_node %pM on interface %s\n", + neigh_addr, orig_node->orig, hard_iface->net_dev->name); + +out: + return neigh_node; +} + +/** * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object * @rcu: rcu pointer of the orig_ifinfo object */ @@ -1010,7 +1029,6 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) spin_unlock_bh(list_lock); } - batadv_gw_node_purge(bat_priv); batadv_gw_election(bat_priv); } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 3fc76f6f710c..fa18f9bf266b 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -42,12 +42,9 @@ void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_neigh_node * -batadv_neigh_node_get(const struct batadv_orig_node *orig_node, - const struct batadv_hard_iface *hard_iface, - const u8 *addr); -struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr, struct batadv_orig_node *orig_node); +batadv_neigh_node_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr); void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index d5c5ad93a611..ac4d08de5df4 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -947,8 +947,6 @@ static void batadv_softif_init_early(struct net_device *dev) * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; - /* reserve more space in the skbuff for our header */ - dev->hard_header_len = batadv_max_header_len(); /* generate random address */ eth_hw_addr_random(dev); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 2f5e6c39f913..d260efd70499 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -328,7 +328,6 @@ enum batadv_orig_capabilities { * @orig_node: pointer to corresponding orig node * @bandwidth_down: advertised uplink download bandwidth * @bandwidth_up: advertised uplink upload bandwidth - * @deleted: this struct is scheduled for deletion * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -337,7 +336,6 @@ struct batadv_gw_node { struct batadv_orig_node *orig_node; u32 bandwidth_down; u32 bandwidth_up; - unsigned long deleted; atomic_t refcount; struct rcu_head rcu; }; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 131e79cde350..db73b8a1433f 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -35,7 +35,6 @@ static struct dentry *lowpan_enable_debugfs; static struct dentry *lowpan_control_debugfs; #define IFACE_NAME_TEMPLATE "bt%d" -#define EUI64_ADDR_LEN 8 struct skb_cb { struct in6_addr addr; @@ -674,13 +673,8 @@ static struct header_ops header_ops = { static void netdev_setup(struct net_device *dev) { - dev->addr_len = EUI64_ADDR_LEN; - dev->type = ARPHRD_6LOWPAN; - dev->hard_header_len = 0; dev->needed_tailroom = 0; - dev->mtu = IPV6_MIN_MTU; - dev->tx_queue_len = 0; dev->flags = IFF_RUNNING | IFF_POINTOPOINT | IFF_MULTICAST; dev->watchdog_timeo = 0; @@ -775,24 +769,7 @@ static struct l2cap_chan *chan_create(void) chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; chan->mode = L2CAP_MODE_LE_FLOWCTL; - chan->omtu = 65535; - chan->imtu = chan->omtu; - - return chan; -} - -static struct l2cap_chan *chan_open(struct l2cap_chan *pchan) -{ - struct l2cap_chan *chan; - - chan = chan_create(); - if (!chan) - return NULL; - - chan->remote_mps = chan->omtu; - chan->mps = chan->omtu; - - chan->state = BT_CONNECTED; + chan->imtu = 1280; return chan; } @@ -919,7 +896,10 @@ static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; - chan = chan_open(pchan); + chan = chan_create(); + if (!chan) + return NULL; + chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); @@ -1065,34 +1045,23 @@ static inline __u8 bdaddr_type(__u8 type) return BDADDR_LE_RANDOM; } -static struct l2cap_chan *chan_get(void) -{ - struct l2cap_chan *pchan; - - pchan = chan_create(); - if (!pchan) - return NULL; - - pchan->ops = &bt_6lowpan_chan_ops; - - return pchan; -} - static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { - struct l2cap_chan *pchan; + struct l2cap_chan *chan; int err; - pchan = chan_get(); - if (!pchan) + chan = chan_create(); + if (!chan) return -EINVAL; - err = l2cap_chan_connect(pchan, cpu_to_le16(L2CAP_PSM_IPSP), 0, + chan->ops = &bt_6lowpan_chan_ops; + + err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type); - BT_DBG("chan %p err %d", pchan, err); + BT_DBG("chan %p err %d", chan, err); if (err < 0) - l2cap_chan_put(pchan); + l2cap_chan_put(chan); return err; } @@ -1117,31 +1086,32 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) static struct l2cap_chan *bt_6lowpan_listen(void) { bdaddr_t *addr = BDADDR_ANY; - struct l2cap_chan *pchan; + struct l2cap_chan *chan; int err; if (!enable_6lowpan) return NULL; - pchan = chan_get(); - if (!pchan) + chan = chan_create(); + if (!chan) return NULL; - pchan->state = BT_LISTEN; - pchan->src_type = BDADDR_LE_PUBLIC; + chan->ops = &bt_6lowpan_chan_ops; + chan->state = BT_LISTEN; + chan->src_type = BDADDR_LE_PUBLIC; - atomic_set(&pchan->nesting, L2CAP_NESTING_PARENT); + atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); - BT_DBG("chan %p src type %d", pchan, pchan->src_type); + BT_DBG("chan %p src type %d", chan, chan->src_type); - err = l2cap_add_psm(pchan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); + err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { - l2cap_chan_put(pchan); + l2cap_chan_put(chan); BT_ERR("psm cannot be added err %d", err); return NULL; } - return pchan; + return chan; } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index adcbc74c2432..d2b3dd32d6cf 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -134,6 +134,66 @@ static const struct file_operations dut_mode_fops = { .llseek = default_llseek, }; +static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y': 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf)-1)); + bool enable; + int err; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + hci_req_lock(hdev); + err = hdev->set_diag(hdev, enable); + hci_req_unlock(hdev); + + if (err < 0) + return err; + + if (enable) + hci_dev_set_flag(hdev, HCI_VENDOR_DIAG); + else + hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG); + + return count; +} + +static const struct file_operations vendor_diag_fops = { + .open = simple_open, + .read = vendor_diag_read, + .write = vendor_diag_write, + .llseek = default_llseek, +}; + +static void hci_debugfs_create_basic(struct hci_dev *hdev) +{ + debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, + &dut_mode_fops); + + if (hdev->set_diag) + debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev, + &vendor_diag_fops); +} + /* ---- HCI requests ---- */ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, @@ -693,7 +753,8 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) hci_setup_event_mask(req); - if (hdev->commands[6] & 0x20) { + if (hdev->commands[6] & 0x20 && + !test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) { struct hci_cp_read_stored_link_key cp; bacpy(&cp.bdaddr, BDADDR_ANY); @@ -849,13 +910,8 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; - /* The Device Under Test (DUT) mode is special and available for - * all controller types. So just create it early on. - */ - if (hci_dev_test_flag(hdev, HCI_SETUP)) { - debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, - &dut_mode_fops); - } + if (hci_dev_test_flag(hdev, HCI_SETUP)) + hci_debugfs_create_basic(hdev); err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); if (err < 0) @@ -932,6 +988,9 @@ static int __hci_unconf_init(struct hci_dev *hdev) if (err < 0) return err; + if (hci_dev_test_flag(hdev, HCI_SETUP)) + hci_debugfs_create_basic(hdev); + return 0; } @@ -1384,6 +1443,9 @@ static int hci_dev_do_open(struct hci_dev *hdev) goto done; } + set_bit(HCI_RUNNING, &hdev->flags); + hci_notify(hdev, HCI_DEV_OPEN); + atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); @@ -1465,6 +1527,9 @@ static int hci_dev_do_open(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + clear_bit(HCI_RUNNING, &hdev->flags); + hci_notify(hdev, HCI_DEV_CLOSE); + hdev->close(hdev); hdev->flags &= BIT(HCI_RAW); } @@ -1548,8 +1613,10 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) BT_DBG("All LE pending actions cleared"); } -static int hci_dev_do_close(struct hci_dev *hdev) +int hci_dev_do_close(struct hci_dev *hdev) { + bool auto_off; + BT_DBG("%s %p", hdev->name, hdev); if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && @@ -1605,10 +1672,10 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - if (!hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { - if (hdev->dev_type == HCI_BREDR) - mgmt_powered(hdev, 0); - } + auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); + + if (!auto_off && hdev->dev_type == HCI_BREDR) + mgmt_powered(hdev, 0); hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); @@ -1625,9 +1692,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && - !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && - test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { + if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && + !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); clear_bit(HCI_INIT, &hdev->flags); @@ -1648,6 +1714,9 @@ static int hci_dev_do_close(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + clear_bit(HCI_RUNNING, &hdev->flags); + hci_notify(hdev, HCI_DEV_CLOSE); + /* After this point our queues are empty * and no tasks are scheduled. */ hdev->close(hdev); @@ -3470,6 +3539,13 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) return -ENXIO; } + if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT && + bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && + bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + kfree_skb(skb); + return -EINVAL; + } + /* Incoming skb */ bt_cb(skb)->incoming = 1; @@ -3483,6 +3559,21 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL(hci_recv_frame); +/* Receive diagnostic message from HCI drivers */ +int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) +{ + /* Time stamp */ + __net_timestamp(skb); + + /* Mark as diagnostic packet and send to monitor */ + bt_cb(skb)->pkt_type = HCI_DIAG_PKT; + hci_send_to_monitor(hdev, skb); + + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(hci_recv_diag); + /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) @@ -3529,6 +3620,11 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); + if (!test_bit(HCI_RUNNING, &hdev->flags)) { + kfree_skb(skb); + return; + } + err = hdev->send(hdev, skb); if (err < 0) { BT_ERR("%s sending frame failed (%d)", hdev->name, err); @@ -3579,6 +3675,25 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; } +/* Send HCI command and wait for command commplete event */ +struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout) +{ + struct sk_buff *skb; + + if (!test_bit(HCI_UP, &hdev->flags)) + return ERR_PTR(-ENETDOWN); + + bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); + + hci_req_lock(hdev); + skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout); + hci_req_unlock(hdev); + + return skb; +} +EXPORT_SYMBOL(hci_cmd_sync); + /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7ba35a9ba6b7..8acec932123a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3726,17 +3726,25 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, if (ev->link_type == ESCO_LINK) goto unlock; + /* When the link type in the event indicates SCO connection + * and lookup of the connection object fails, then check + * if an eSCO connection object exists. + * + * The core limits the synchronous connections to either + * SCO or eSCO. The eSCO connection is preferred and tried + * to be setup first and until successfully established, + * the link type will be hinted as eSCO. + */ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr); if (!conn) goto unlock; - - conn->type = SCO_LINK; } switch (ev->status) { case 0x00: conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; + conn->type = ev->link_type; hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); @@ -4711,6 +4719,27 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, struct hci_conn *conn; bool match; u32 flags; + u8 *ptr, real_len; + + /* Find the end of the data in case the report contains padded zero + * bytes at the end causing an invalid length value. + * + * When data is NULL, len is 0 so there is no need for extra ptr + * check as 'ptr < data + 0' is already false in such case. + */ + for (ptr = data; ptr < data + len && *ptr; ptr += *ptr + 1) { + if (ptr + 1 + *ptr > data + len) + break; + } + + real_len = ptr - data; + + /* Adjust for actual length */ + if (len != real_len) { + BT_ERR_RATELIMITED("%s advertising data length corrected", + hdev->name); + len = real_len; + } /* If the direct address is present, then this report is from * a LE Direct Advertising Report event. In that case it is diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index f2d30d1156c9..9a100c1fd7b5 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -279,6 +279,9 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; + case HCI_DIAG_PKT: + opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); + break; default: return; } @@ -303,6 +306,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; struct hci_mon_new_index *ni; + struct hci_mon_index_info *ii; struct sk_buff *skb; __le16 opcode; @@ -312,7 +316,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) if (!skb) return NULL; - ni = (void *) skb_put(skb, HCI_MON_NEW_INDEX_SIZE); + ni = (void *)skb_put(skb, HCI_MON_NEW_INDEX_SIZE); ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); @@ -329,6 +333,34 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; + case HCI_DEV_UP: + skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); + if (!skb) + return NULL; + + ii = (void *)skb_put(skb, HCI_MON_INDEX_INFO_SIZE); + bacpy(&ii->bdaddr, &hdev->bdaddr); + ii->manufacturer = cpu_to_le16(hdev->manufacturer); + + opcode = cpu_to_le16(HCI_MON_INDEX_INFO); + break; + + case HCI_DEV_OPEN: + skb = bt_skb_alloc(0, GFP_ATOMIC); + if (!skb) + return NULL; + + opcode = cpu_to_le16(HCI_MON_OPEN_INDEX); + break; + + case HCI_DEV_CLOSE: + skb = bt_skb_alloc(0, GFP_ATOMIC); + if (!skb) + return NULL; + + opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX); + break; + default: return NULL; } @@ -358,6 +390,26 @@ static void send_monitor_replay(struct sock *sk) if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); + + if (!test_bit(HCI_RUNNING, &hdev->flags)) + continue; + + skb = create_monitor_event(hdev, HCI_DEV_OPEN); + if (!skb) + continue; + + if (sock_queue_rcv_skb(sk, skb)) + kfree_skb(skb); + + if (!test_bit(HCI_UP, &hdev->flags)) + continue; + + skb = create_monitor_event(hdev, HCI_DEV_UP); + if (!skb) + continue; + + if (sock_queue_rcv_skb(sk, skb)) + kfree_skb(skb); } read_unlock(&hci_dev_list_lock); @@ -392,14 +444,12 @@ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) void hci_sock_dev_event(struct hci_dev *hdev, int event) { - struct hci_ev_si_device ev; - BT_DBG("hdev %s event %d", hdev->name, event); - /* Send event to monitor */ if (atomic_read(&monitor_promisc)) { struct sk_buff *skb; + /* Send event to monitor */ skb = create_monitor_event(hdev, event); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, @@ -408,10 +458,14 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) } } - /* Send event to sockets */ - ev.event = event; - ev.dev_id = hdev->id; - hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + if (event <= HCI_DEV_DOWN) { + struct hci_ev_si_device ev; + + /* Send event to sockets */ + ev.event = event; + ev.dev_id = hdev->id; + hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + } if (event == HCI_DEV_UNREG) { struct sock *sk; @@ -503,7 +557,16 @@ static int hci_sock_release(struct socket *sock) if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { - hci_dev_close(hdev->id); + /* When releasing an user channel exclusive access, + * call hci_dev_do_close directly instead of calling + * hci_dev_close to ensure the exclusive access will + * be released and the controller brought back down. + * + * The checking of HCI_AUTO_OFF is not needed in this + * case since it will have been cleared already when + * opening the user channel. + */ + hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); mgmt_index_added(hdev); } diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index b36bc0415854..aa4cf64e32a6 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -151,6 +151,22 @@ void bt_info(const char *format, ...) } EXPORT_SYMBOL(bt_info); +void bt_warn(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + pr_warn("%pV", &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_warn); + void bt_err(const char *format, ...) { struct va_format vaf; @@ -166,3 +182,19 @@ void bt_err(const char *format, ...) va_end(args); } EXPORT_SYMBOL(bt_err); + +void bt_err_ratelimited(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + pr_err_ratelimited("%pV", &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_err_ratelimited); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 688a040c5626..f315c8d0e43b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -154,13 +154,13 @@ static void sco_chan_del(struct sock *sk, int err) sock_set_flag(sk, SOCK_ZAPPED); } -static int sco_conn_del(struct hci_conn *hcon, int err) +static void sco_conn_del(struct hci_conn *hcon, int err) { struct sco_conn *conn = hcon->sco_data; struct sock *sk; if (!conn) - return 0; + return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); @@ -179,7 +179,6 @@ static int sco_conn_del(struct hci_conn *hcon, int err) hcon->sco_data = NULL; kfree(conn); - return 0; } static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ad82324f710f..25644e1bc479 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -495,7 +495,7 @@ static int smp_ah(struct crypto_blkcipher *tfm, const u8 irk[16], } /* The output of the random address function ah is: - * ah(h, r) = e(k, r') mod 2^24 + * ah(k, r) = e(k, r') mod 2^24 * The output of the security function e is then truncated to 24 bits * by taking the least significant 24 bits of the output of e as the * result of ah. @@ -2311,12 +2311,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!conn) return 1; - chan = conn->smp; - if (!chan) { - BT_ERR("SMP security requested but not available"); - return 1; - } - if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; @@ -2330,6 +2324,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; + chan = conn->smp; + if (!chan) { + BT_ERR("SMP security requested but not available"); + return 1; + } + l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 6ed2feb51e3c..bdfb9544ca03 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -56,7 +56,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid)) + if (!br_allowed_ingress(br, br_vlan_group(br), skb, &vid)) goto out; if (is_broadcast_ether_addr(dest)) @@ -391,7 +391,7 @@ void br_dev_setup(struct net_device *dev) br->bridge_max_age = br->max_age = 20 * HZ; br->bridge_hello_time = br->hello_time = 2 * HZ; br->bridge_forward_delay = br->forward_delay = 15 * HZ; - br->ageing_time = 300 * HZ; + br->ageing_time = BR_DEFAULT_AGEING_TIME; br_netfilter_rtable_init(br); br_stp_timer_init(br); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 9e9875da0a4f..f43ce05c66a6 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -133,15 +133,13 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) { - struct switchdev_obj obj = { - .id = SWITCHDEV_OBJ_PORT_FDB, - .u.fdb = { - .addr = f->addr.addr, - .vid = f->vlan_id, - }, + struct switchdev_obj_port_fdb fdb = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .addr = f->addr.addr, + .vid = f->vlan_id, }; - switchdev_port_obj_del(f->dst->dev, &obj); + switchdev_port_obj_del(f->dst->dev, &fdb.obj); } static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) @@ -163,22 +161,27 @@ static void fdb_delete_local(struct net_bridge *br, struct net_bridge_fdb_entry *f) { const unsigned char *addr = f->addr.addr; - u16 vid = f->vlan_id; + struct net_bridge_vlan_group *vg; + const struct net_bridge_vlan *v; struct net_bridge_port *op; + u16 vid = f->vlan_id; /* Maybe another port has same hw addr? */ list_for_each_entry(op, &br->port_list, list) { + vg = nbp_vlan_group(op); if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && - (!vid || nbp_vlan_find(op, vid))) { + (!vid || br_vlan_find(vg, vid))) { f->dst = op; f->added_by_user = 0; return; } } + vg = br_vlan_group(br); + v = br_vlan_find(vg, vid); /* Maybe bridge device has same hw addr? */ if (p && ether_addr_equal(br->dev->dev_addr, addr) && - (!vid || br_vlan_find(br, vid))) { + (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; f->added_by_user = 0; return; @@ -203,14 +206,14 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { + struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; - struct net_port_vlans *pv = nbp_get_vlan_info(p); - bool no_vlan = !pv; + struct net_bridge_vlan *v; int i; - u16 vid; spin_lock_bh(&br->hash_lock); + vg = nbp_vlan_group(p); /* Search all chains since old address/hash is unknown */ for (i = 0; i < BR_HASH_SIZE; i++) { struct hlist_node *h; @@ -226,7 +229,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) * configured, we can safely be done at * this point. */ - if (no_vlan) + if (!vg || !vg->num_vlans) goto insert; } } @@ -236,15 +239,15 @@ insert: /* insert new address, may fail if invalid address or dup. */ fdb_insert(br, p, newaddr, 0); - if (no_vlan) + if (!vg || !vg->num_vlans) goto done; /* Now add entries for every VLAN configured on the port. * This function runs under RTNL so the bitmap will not change * from under us. */ - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - fdb_insert(br, p, newaddr, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) + fdb_insert(br, p, newaddr, v->vid); done: spin_unlock_bh(&br->hash_lock); @@ -252,9 +255,9 @@ done: void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) { + struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; - struct net_port_vlans *pv; - u16 vid = 0; + struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); @@ -264,20 +267,18 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); - + vg = br_vlan_group(br); + if (!vg || !vg->num_vlans) + goto out; /* Now remove and add entries for every VLAN configured on the * bridge. This function runs under RTNL so the bitmap will not * change from under us. */ - pv = br_get_vlan_info(br); - if (!pv) - goto out; - - for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) { - f = __br_fdb_get(br, br->dev->dev_addr, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) { + f = __br_fdb_get(br, br->dev->dev_addr, v->vid); if (f && f->is_local && !f->dst) fdb_delete_local(br, NULL, f); - fdb_insert(br, NULL, newaddr, vid); + fdb_insert(br, NULL, newaddr, v->vid); } out: spin_unlock_bh(&br->hash_lock); @@ -299,6 +300,8 @@ void br_fdb_cleanup(unsigned long _data) unsigned long this_timer; if (f->is_static) continue; + if (f->added_by_external_learn) + continue; this_timer = f->updated + delay; if (time_before_eq(this_timer, jiffies)) fdb_delete(br, f); @@ -605,13 +608,14 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } -static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb) +static int fdb_to_nud(const struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb) { if (fdb->is_local) return NUD_PERMANENT; else if (fdb->is_static) return NUD_NOARP; - else if (has_expired(fdb->dst->br, fdb)) + else if (has_expired(br, fdb)) return NUD_STALE; else return NUD_REACHABLE; @@ -637,7 +641,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0; ndm->ndm_type = 0; ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; - ndm->ndm_state = fdb_to_nud(fdb); + ndm->ndm_state = fdb_to_nud(br, fdb); if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr)) goto nla_put_failure; @@ -782,7 +786,7 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, } } - if (fdb_to_nud(fdb) != state) { + if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { fdb->is_local = 1; if (!fdb->is_static) { @@ -842,9 +846,11 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags) { - struct net_bridge_port *p; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan *v; + struct net_bridge *br = NULL; int err = 0; - struct net_port_vlans *pv; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); @@ -856,34 +862,51 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - p = br_port_get_rtnl(dev); - if (p == NULL) { - pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", - dev->name); - return -EINVAL; + if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + vg = nbp_vlan_group(p); } - pv = nbp_get_vlan_info(p); if (vid) { - if (!pv || !test_bit(vid, pv->vlan_bitmap)) { - pr_info("bridge: RTM_NEWNEIGH with unconfigured " - "vlan %d on port %s\n", vid, dev->name); + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) { + pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } /* VID was specified, so use it. */ - err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, vid); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); } else { - err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); - if (err || !pv) + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, 0); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); + if (err || !vg || !vg->num_vlans) goto out; /* We have vlans configured on this port and user didn't * specify a VLAN. To be nice, add/update entry for every * vlan on this port. */ - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, v->vid); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, + v->vid); if (err) goto out; } @@ -893,6 +916,32 @@ out: return err; } +static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, + u16 vid) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; + struct net_bridge_fdb_entry *fdb; + + fdb = fdb_find(head, addr, vid); + if (!fdb) + return -ENOENT; + + fdb_delete(br, fdb); + return 0; +} + +static int __br_fdb_delete_by_addr(struct net_bridge *br, + const unsigned char *addr, u16 vid) +{ + int err; + + spin_lock_bh(&br->hash_lock); + err = fdb_delete_by_addr(br, addr, vid); + spin_unlock_bh(&br->hash_lock); + + return err; +} + static int fdb_delete_by_addr_and_port(struct net_bridge_port *p, const u8 *addr, u16 vlan) { @@ -925,38 +974,53 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { - struct net_bridge_port *p; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan *v; + struct net_bridge *br = NULL; int err; - struct net_port_vlans *pv; - p = br_port_get_rtnl(dev); - if (p == NULL) { - pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", - dev->name); - return -EINVAL; + if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + vg = nbp_vlan_group(p); } - pv = nbp_get_vlan_info(p); if (vid) { - if (!pv || !test_bit(vid, pv->vlan_bitmap)) { - pr_info("bridge: RTM_DELNEIGH with unconfigured " - "vlan %d on port %s\n", vid, dev->name); + v = br_vlan_find(vg, vid); + if (!v) { + pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } - err = __br_fdb_delete(p, addr, vid); + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, vid); + else + err = __br_fdb_delete(p, addr, vid); } else { err = -ENOENT; - err &= __br_fdb_delete(p, addr, 0); - if (!pv) + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, 0); + else + err &= __br_fdb_delete(p, addr, 0); + + if (!vg || !vg->num_vlans) goto out; - /* We have vlans configured on this port and user didn't - * specify a VLAN. To be nice, add/update entry for every - * vlan on this port. - */ - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - err &= __br_fdb_delete(p, addr, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, v->vid); + else + err &= __br_fdb_delete(p, addr, v->vid); } } out: diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index fa7bfced888e..6d5ed795c3e2 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -30,12 +30,14 @@ static int deliver_clone(const struct net_bridge_port *prev, static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { + struct net_bridge_vlan_group *vg; + + vg = nbp_vlan_group(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) && - p->state == BR_STATE_FORWARDING; + br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; } -int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb) +int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { if (!is_skb_forwardable(skb->dev, skb)) goto drop; @@ -65,10 +67,10 @@ drop: } EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); -int br_forward_finish(struct sock *sk, struct sk_buff *skb) +int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, sk, skb, - NULL, skb->dev, + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, + net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); } @@ -76,7 +78,10 @@ EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { - skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb); + struct net_bridge_vlan_group *vg; + + vg = nbp_vlan_group(to); + skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; @@ -92,13 +97,14 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) return; } - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, - NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, + dev_net(skb->dev), NULL, skb,NULL, skb->dev, br_forward_finish); } static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) { + struct net_bridge_vlan_group *vg; struct net_device *indev; if (skb_warn_if_lro(skb)) { @@ -106,7 +112,8 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) return; } - skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb); + vg = nbp_vlan_group(to); + skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; @@ -114,8 +121,8 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) skb->dev = to->dev; skb_forward_csum(skb); - NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, NULL, skb, - indev, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, + dev_net(indev), NULL, skb, indev, skb->dev, br_forward_finish); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 45e4757c6fd2..934cae9fa317 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -248,7 +248,6 @@ static void del_nbp(struct net_bridge_port *p) list_del_rcu(&p->list); - nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); @@ -257,6 +256,8 @@ static void del_nbp(struct net_bridge_port *p) dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); + /* use the synchronize_rcu done by netdev_rx_handler_unregister */ + nbp_vlan_flush(p); br_multicast_del_port(p); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f921a5dce22d..f5c5a4500e2f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -26,38 +26,44 @@ br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; EXPORT_SYMBOL(br_should_route_hook); +static int +br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + return netif_receive_skb(skb); +} + static int br_pass_frame_up(struct sk_buff *skb) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); + struct net_bridge_vlan_group *vg; struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); - struct net_port_vlans *pv; u64_stats_update_begin(&brstats->syncp); brstats->rx_packets++; brstats->rx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); + vg = br_vlan_group(br); /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc modue when someone * may be running packet capture. */ - pv = br_get_vlan_info(br); if (!(brdev->flags & IFF_PROMISC) && - !br_allowed_egress(br, pv, skb)) { + !br_allowed_egress(vg, skb)) { kfree_skb(skb); return NET_RX_DROP; } indev = skb->dev; skb->dev = brdev; - skb = br_handle_vlan(br, pv, skb); + skb = br_handle_vlan(br, vg, skb); if (!skb) return NET_RX_DROP; - return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb, - indev, NULL, - netif_receive_skb_sk); + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, + dev_net(indev), NULL, skb, indev, NULL, + br_netif_receive_skb); } static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, @@ -120,7 +126,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, } /* note: already called with rcu_read_lock */ -int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb) +int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_port *p = br_port_get_rcu(skb->dev); @@ -134,7 +140,7 @@ int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb) if (!p || p->state == BR_STATE_DISABLED) goto drop; - if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid)) + if (!br_allowed_ingress(p->br, nbp_vlan_group(p), skb, &vid)) goto out; /* insert into forwarding database after filtering to avoid spoofing */ @@ -208,7 +214,7 @@ drop: EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ -static int br_handle_local_finish(struct sock *sk, struct sk_buff *skb) +static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; @@ -278,8 +284,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } /* Deliver packet to local host only */ - if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb, - skb->dev, NULL, br_handle_local_finish)) { + if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, + br_handle_local_finish)) { return RX_HANDLER_CONSUMED; /* consumed by filter */ } else { *pskb = skb; @@ -303,8 +310,8 @@ forward: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, br_handle_frame_finish); break; default: diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 8d423bc649b9..263b4de4de57 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -200,8 +200,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br->ageing_time = clock_t_to_jiffies(args[1]); - return 0; + return br_set_ageing_time(br, args[1]); case BRCTL_GET_PORT_INFO: { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index d747275fad18..cd8deea2d074 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -464,11 +464,11 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - unsigned short vid = VLAN_N_VID; + struct net_bridge_vlan_group *vg; struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_port *p; - struct net_port_vlans *pv; + struct net_bridge_vlan *v; struct net_bridge *br; int err; @@ -489,10 +489,10 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; - pv = nbp_get_vlan_info(p); - if (br_vlan_enabled(br) && pv && entry->vid == 0) { - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - entry->vid = vid; + vg = nbp_vlan_group(p); + if (br_vlan_enabled(br) && vg && entry->vid == 0) { + list_for_each_entry(v, &vg->vlan_list, vlist) { + entry->vid = v->vid; err = __br_mdb_add(net, br, entry); if (err) break; @@ -566,11 +566,11 @@ unlock: static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - unsigned short vid = VLAN_N_VID; + struct net_bridge_vlan_group *vg; struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_port *p; - struct net_port_vlans *pv; + struct net_bridge_vlan *v; struct net_bridge *br; int err; @@ -591,10 +591,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; - pv = nbp_get_vlan_info(p); - if (br_vlan_enabled(br) && pv && entry->vid == 0) { - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - entry->vid = vid; + vg = nbp_vlan_group(p); + if (br_vlan_enabled(br) && vg && entry->vid == 0) { + list_for_each_entry(v, &vg->vlan_list, vlist) { + entry->vid = v->vid; err = __br_mdb_del(br, entry); if (!err) __br_mdb_notify(dev, entry, RTM_DELMDB); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 66efdc21f548..03661d97463c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -829,8 +829,8 @@ static void __br_multicast_send_query(struct net_bridge *br, if (port) { skb->dev = port->dev; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, - NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, + dev_net(port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); @@ -1006,7 +1006,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); - len = sizeof(*ih); + len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); @@ -1067,7 +1067,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, icmp6h = icmp6_hdr(skb); num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); - len = sizeof(*icmp6h); + len = skb_transport_offset(skb) + sizeof(*icmp6h); for (i = 0; i < num; i++) { __be16 *nsrcs, _nsrcs; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0a6f095bb0c9..370aa4d4cf4d 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -189,10 +189,9 @@ static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) * expected format */ -static int br_validate_ipv4(struct sk_buff *skb) +static int br_validate_ipv4(struct net *net, struct sk_buff *skb) { const struct iphdr *iph; - struct net_device *dev = skb->dev; u32 len; if (!pskb_may_pull(skb, sizeof(struct iphdr))) @@ -213,13 +212,13 @@ static int br_validate_ipv4(struct sk_buff *skb) len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -232,7 +231,7 @@ static int br_validate_ipv4(struct sk_buff *skb) return 0; inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); drop: return -1; } @@ -256,7 +255,7 @@ void nf_bridge_update_protocol(struct sk_buff *skb) * don't, we use the neighbour framework to find out. In both cases, we make * sure that br_handle_frame_finish() is called afterwards. */ -int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) +int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb) { struct neighbour *neigh; struct dst_entry *dst; @@ -273,7 +272,7 @@ int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) if (neigh->hh.hh_len) { neigh_hh_bridge(&neigh->hh, skb); skb->dev = nf_bridge->physindev; - ret = br_handle_frame_finish(sk, skb); + ret = br_handle_frame_finish(net, sk, skb); } else { /* the neighbour function below overwrites the complete * MAC header, so we save the Ethernet source address and @@ -342,7 +341,7 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, * device, we proceed as if ip_route_input() succeeded. If it differs from the * logical bridge port or if ip_route_output_key() fails we drop the packet. */ -static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) +static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct iphdr *iph = ip_hdr(skb); @@ -371,7 +370,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) goto free_skb; - rt = ip_route_output(dev_net(dev), iph->daddr, 0, + rt = ip_route_output(net, iph->daddr, 0, RT_TOS(iph->tos), 0); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't @@ -393,7 +392,7 @@ bridged_dnat: nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - sk, skb, skb->dev, NULL, + net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge, 1); return 0; @@ -413,7 +412,7 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish, 1); @@ -464,7 +463,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb) * receiving device) to make netfilter happy, the REDIRECT * target in particular. Save the original destination IP * address to be able to detect DNAT afterwards. */ -static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, +static unsigned int br_nf_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -486,7 +485,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); - return br_nf_pre_routing_ipv6(ops, skb, state); + return br_nf_pre_routing_ipv6(priv, skb, state); } if (!brnf_call_iptables && !br->nf_call_iptables) @@ -497,7 +496,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, nf_bridge_pull_encap_header_rcsum(skb); - if (br_validate_ipv4(skb)) + if (br_validate_ipv4(state->net, skb)) return NF_DROP; nf_bridge_put(skb->nf_bridge); @@ -511,7 +510,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, skb->protocol = htons(ETH_P_IP); - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish); @@ -526,7 +525,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, * took place when the packet entered the bridge), but we * register an IPv4 PRE_ROUTING 'sabotage' hook that will * prevent this from happening. */ -static unsigned int br_nf_local_in(const struct nf_hook_ops *ops, +static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -535,7 +534,7 @@ static unsigned int br_nf_local_in(const struct nf_hook_ops *ops, } /* PF_BRIDGE/FORWARD *************************************************/ -static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) +static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *in; @@ -559,7 +558,7 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) } nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb, + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, net, sk, skb, in, skb->dev, br_forward_finish, 1); return 0; } @@ -570,7 +569,7 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) * but we are still able to filter on the 'real' indev/outdev * because of the physdev module. For ARP, indev and outdev are the * bridge ports. */ -static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, +static unsigned int br_nf_forward_ip(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -609,13 +608,13 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, } if (pf == NFPROTO_IPV4) { - if (br_validate_ipv4(skb)) + if (br_validate_ipv4(state->net, skb)) return NF_DROP; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; } if (pf == NFPROTO_IPV6) { - if (br_validate_ipv6(skb)) + if (br_validate_ipv6(state->net, skb)) return NF_DROP; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; } @@ -626,14 +625,14 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, else skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(pf, NF_INET_FORWARD, NULL, skb, + NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb, brnf_get_logical_dev(skb, state->in), parent, br_nf_forward_finish); return NF_STOLEN; } -static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, +static unsigned int br_nf_forward_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -661,14 +660,14 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, return NF_ACCEPT; } *d = state->in; - NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb, + NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb, state->in, state->out, br_nf_forward_finish); return NF_STOLEN; } #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) -static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) +static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct brnf_frag_data *data; int err; @@ -690,28 +689,27 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) __skb_push(skb, data->encap_size); nf_bridge_info_free(skb); - return br_dev_queue_push_xmit(sk, skb); + return br_dev_queue_push_xmit(net, sk, skb); } #endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) -static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +static int +br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { unsigned int mtu = ip_skb_dst_mtu(skb); struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); } #endif @@ -722,7 +720,7 @@ static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) return 0; } -static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) +static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge; unsigned int mtu_reserved; @@ -731,7 +729,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { nf_bridge_info_free(skb); - return br_dev_queue_push_xmit(sk, skb); + return br_dev_queue_push_xmit(net, sk, skb); } nf_bridge = nf_bridge_info_get(skb); @@ -743,7 +741,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; - if (br_validate_ipv4(skb)) + if (br_validate_ipv4(net, skb)) goto drop; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; @@ -760,7 +758,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - return br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); + return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); } #endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) @@ -768,7 +766,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); struct brnf_frag_data *data; - if (br_validate_ipv6(skb)) + if (br_validate_ipv6(net, skb)) goto drop; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; @@ -783,21 +781,21 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) data->size); if (v6ops) - return v6ops->fragment(sk, skb, br_nf_push_frag_xmit); + return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); kfree_skb(skb); return -EMSGSIZE; } #endif nf_bridge_info_free(skb); - return br_dev_queue_push_xmit(sk, skb); + return br_dev_queue_push_xmit(net, sk, skb); drop: kfree_skb(skb); return 0; } /* PF_BRIDGE/POST_ROUTING ********************************************/ -static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, +static unsigned int br_nf_post_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -836,7 +834,7 @@ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, else skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb, + NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb, NULL, realoutdev, br_nf_dev_queue_xmit); @@ -846,7 +844,7 @@ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, /* IP/SABOTAGE *****************************************************/ /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING * for the second time. */ -static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, +static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -880,7 +878,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) skb->dev = nf_bridge->physindev; nf_bridge->physoutdev = NULL; - br_handle_frame_finish(NULL, skb); + br_handle_frame_finish(dev_net(skb->dev), NULL, skb); } static int br_nf_dev_xmit(struct sk_buff *skb) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 77383bfe7ea3..d61f56efc8dc 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -100,10 +100,9 @@ bad: return -1; } -int br_validate_ipv6(struct sk_buff *skb) +int br_validate_ipv6(struct net *net, struct sk_buff *skb) { const struct ipv6hdr *hdr; - struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(skb->dev); u32 pkt_len; u8 ip6h_len = sizeof(struct ipv6hdr); @@ -123,12 +122,12 @@ int br_validate_ipv6(struct sk_buff *skb) if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + ip6h_len > skb->len) { - IP6_INC_STATS_BH(dev_net(dev), idev, + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { - IP6_INC_STATS_BH(dev_net(dev), idev, + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -143,7 +142,7 @@ int br_validate_ipv6(struct sk_buff *skb) return 0; inhdr_error: - IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); drop: return -1; } @@ -161,7 +160,7 @@ br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb, * for br_nf_pre_routing_finish(), same logic is used here but * equivalent IPv6 function ip6_route_input() called indirectly. */ -static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) +static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; @@ -189,7 +188,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - sk, skb, skb->dev, NULL, + net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge, 1); return 0; @@ -208,7 +207,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish, 1); @@ -218,13 +217,13 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) /* Replicate the checks that IPv6 does on packet reception and pass the packet * to ip6tables. */ -unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, +unsigned int br_nf_pre_routing_ipv6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; - if (br_validate_ipv6(skb)) + if (br_validate_ipv6(state->net, skb)) return NF_DROP; nf_bridge_put(skb->nf_bridge); @@ -237,7 +236,7 @@ unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_ipv6); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dbcb1949ea58..d792d1a848ad 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -16,42 +16,40 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> -#include <net/switchdev.h> #include <uapi/linux/if_bridge.h> #include "br_private.h" #include "br_private_stp.h" -static int br_get_num_vlan_infos(const struct net_port_vlans *pv, - u32 filter_mask) +static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, + u32 filter_mask) { - u16 vid_range_start = 0, vid_range_end = 0; - u16 vid_range_flags = 0; - u16 pvid, vid, flags; + struct net_bridge_vlan *v; + u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; + u16 flags, pvid; int num_vlans = 0; - if (filter_mask & RTEXT_FILTER_BRVLAN) - return pv->num_vlans; - if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) return 0; - /* Count number of vlan info's - */ - pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + pvid = br_get_pvid(vg); + /* Count number of vlan infos */ + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; - if (vid == pvid) + /* only a context, bridge vlan not activated */ + if (!br_vlan_should_use(v)) + continue; + if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; - if (test_bit(vid, pv->untagged_bitmap)) + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; - } else if ((vid - vid_range_end) == 1 && + } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { - vid_range_end = vid; + vid_range_end = v->vid; continue; } else { if ((vid_range_end - vid_range_start) > 0) @@ -60,8 +58,8 @@ static int br_get_num_vlan_infos(const struct net_port_vlans *pv, num_vlans += 1; } initvars: - vid_range_start = vid; - vid_range_end = vid; + vid_range_start = v->vid; + vid_range_end = v->vid; vid_range_flags = flags; } @@ -75,28 +73,43 @@ initvars: return num_vlans; } +static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg, + u32 filter_mask) +{ + int num_vlans; + + if (!vg) + return 0; + + if (filter_mask & RTEXT_FILTER_BRVLAN) + return vg->num_vlans; + + rcu_read_lock(); + num_vlans = __get_num_vlan_infos(vg, filter_mask); + rcu_read_unlock(); + + return num_vlans; +} + static size_t br_get_link_af_size_filtered(const struct net_device *dev, u32 filter_mask) { - struct net_port_vlans *pv; + struct net_bridge_vlan_group *vg = NULL; + struct net_bridge_port *p; + struct net_bridge *br; int num_vlan_infos; rcu_read_lock(); - if (br_port_exists(dev)) - pv = nbp_get_vlan_info(br_port_get_rcu(dev)); - else if (dev->priv_flags & IFF_EBRIDGE) - pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev)); - else - pv = NULL; - if (pv) - num_vlan_infos = br_get_num_vlan_infos(pv, filter_mask); - else - num_vlan_infos = 0; + if (br_port_exists(dev)) { + p = br_port_get_rcu(dev); + vg = nbp_vlan_group(p); + } else if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } + num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); - if (!num_vlan_infos) - return 0; - /* Each VLAN is returned in bridge_vlan_info along with flags */ return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); } @@ -114,6 +127,20 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ +#endif + 0; } @@ -135,6 +162,7 @@ static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & BR_HAIRPIN_MODE); + u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) || @@ -147,8 +175,35 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, - !!(p->flags & BR_PROXYARP_WIFI))) + !!(p->flags & BR_PROXYARP_WIFI)) || + nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), + &p->designated_root) || + nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), + &p->designated_bridge) || + nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || + nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) || + nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) || + nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || + nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + p->topology_change_ack) || + nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending)) + return -EMSGSIZE; + + timerval = br_timer_value(&p->message_age_timer); + if (nla_put_u64(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval)) + return -EMSGSIZE; + timerval = br_timer_value(&p->forward_delay_timer); + if (nla_put_u64(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval)) + return -EMSGSIZE; + timerval = br_timer_value(&p->hold_timer); + if (nla_put_u64(skb, IFLA_BRPORT_HOLD_TIMER, timerval)) + return -EMSGSIZE; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, + p->multicast_router)) return -EMSGSIZE; +#endif return 0; } @@ -186,31 +241,33 @@ nla_put_failure: } static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, - const struct net_port_vlans *pv) + struct net_bridge_vlan_group *vg) { - u16 vid_range_start = 0, vid_range_end = 0; - u16 vid_range_flags = 0; - u16 pvid, vid, flags; + struct net_bridge_vlan *v; + u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; + u16 flags, pvid; int err = 0; /* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan * and mark vlan info with begin and end flags * if vlaninfo represents a range */ - pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + pvid = br_get_pvid(vg); + list_for_each_entry(v, &vg->vlan_list, vlist) { flags = 0; - if (vid == pvid) + if (!br_vlan_should_use(v)) + continue; + if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; - if (test_bit(vid, pv->untagged_bitmap)) + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; - } else if ((vid - vid_range_end) == 1 && + } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { - vid_range_end = vid; + vid_range_end = v->vid; continue; } else { err = br_fill_ifvlaninfo_range(skb, vid_range_start, @@ -221,8 +278,8 @@ static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, } initvars: - vid_range_start = vid; - vid_range_end = vid; + vid_range_start = v->vid; + vid_range_end = v->vid; vid_range_flags = flags; } @@ -239,19 +296,23 @@ initvars: } static int br_fill_ifvlaninfo(struct sk_buff *skb, - const struct net_port_vlans *pv) + struct net_bridge_vlan_group *vg) { struct bridge_vlan_info vinfo; - u16 pvid, vid; + struct net_bridge_vlan *v; + u16 pvid; - pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - vinfo.vid = vid; + pvid = br_get_pvid(vg); + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + + vinfo.vid = v->vid; vinfo.flags = 0; - if (vid == pvid) + if (v->vid == pvid) vinfo.flags |= BRIDGE_VLAN_INFO_PVID; - if (test_bit(vid, pv->untagged_bitmap)) + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, @@ -270,11 +331,11 @@ nla_put_failure: * Contains port and master info as well as carrier and bridge state. */ static int br_fill_ifinfo(struct sk_buff *skb, - const struct net_bridge_port *port, + struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, u32 filter_mask, const struct net_device *dev) { - const struct net_bridge *br; + struct net_bridge *br; struct ifinfomsg *hdr; struct nlmsghdr *nlh; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; @@ -321,16 +382,16 @@ static int br_fill_ifinfo(struct sk_buff *skb, /* Check if the VID information is requested */ if ((filter_mask & RTEXT_FILTER_BRVLAN) || (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { - const struct net_port_vlans *pv; + struct net_bridge_vlan_group *vg; struct nlattr *af; int err; if (port) - pv = nbp_get_vlan_info(port); + vg = nbp_vlan_group(port); else - pv = br_get_vlan_info(br); + vg = br_vlan_group(br); - if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) + if (!vg || !vg->num_vlans) goto done; af = nla_nest_start(skb, IFLA_AF_SPEC); @@ -338,9 +399,9 @@ static int br_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) - err = br_fill_ifvlaninfo_compressed(skb, pv); + err = br_fill_ifvlaninfo_compressed(skb, vg); else - err = br_fill_ifvlaninfo(skb, pv); + err = br_fill_ifvlaninfo(skb, vg); if (err) goto nla_put_failure; nla_nest_end(skb, af); @@ -414,14 +475,14 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, switch (cmd) { case RTM_SETLINK: if (p) { + /* if the MASTER flag is set this will act on the global + * per-VLAN entry as well + */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags); if (err) break; - - if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER) - err = br_vlan_add(p->br, vinfo->vid, - vinfo->flags); } else { + vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags); } break; @@ -463,6 +524,9 @@ static int br_afspec(struct net_bridge *br, if (vinfo_start) return -EINVAL; vinfo_start = vinfo; + /* don't allow range of pvids */ + if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID) + return -EINVAL; continue; } @@ -508,6 +572,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, + [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -579,6 +644,18 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } + if (tb[IFLA_BRPORT_FLUSH]) + br_fdb_delete_by_port(p->br, p, 0, 0); + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { + u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); + + err = br_multicast_set_port_router(p, mcast_router); + if (err) + return err; + } +#endif br_port_flags_change(p, old_flags ^ p->flags); return 0; } @@ -673,6 +750,21 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return -EADDRNOTAVAIL; } + if (!data) + return 0; + +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + switch (nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + break; + default: + return -EPROTONOSUPPORT; + } + } +#endif + return 0; } @@ -729,6 +821,28 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, + [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, + [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, + .len = ETH_ALEN }, + [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 }, + [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, + [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -759,9 +873,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } if (data[IFLA_BR_AGEING_TIME]) { - u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]); - - br->ageing_time = clock_t_to_jiffies(ageing_time); + err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME])); + if (err) + return err; } if (data[IFLA_BR_STP_STATE]) { @@ -784,6 +898,168 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); + + err = __br_vlan_set_proto(br, vlan_proto); + if (err) + return err; + } + + if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { + __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); + + err = __br_vlan_set_default_pvid(br, defpvid); + if (err) + return err; + } +#endif + + if (data[IFLA_BR_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_RESTRICTED) + return -EINVAL; + br->group_fwd_mask = fwd_mask; + } + + if (data[IFLA_BR_GROUP_ADDR]) { + u8 new_addr[ETH_ALEN]; + + if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN) + return -EINVAL; + memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN); + if (!is_link_local_ether_addr(new_addr)) + return -EINVAL; + if (new_addr[5] == 1 || /* 802.3x Pause address */ + new_addr[5] == 2 || /* 802.3ad Slow protocols */ + new_addr[5] == 3) /* 802.1X PAE address */ + return -EINVAL; + spin_lock_bh(&br->lock); + memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); + spin_unlock_bh(&br->lock); + br->group_addr_set = true; + br_recalculate_fwd_mask(br); + } + + if (data[IFLA_BR_FDB_FLUSH]) + br_fdb_flush(br); + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (data[IFLA_BR_MCAST_ROUTER]) { + u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); + + err = br_multicast_set_router(br, multicast_router); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_SNOOPING]) { + u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); + + err = br_multicast_toggle(br, mcast_snooping); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { + u8 val; + + val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); + br->multicast_query_use_ifaddr = !!val; + } + + if (data[IFLA_BR_MCAST_QUERIER]) { + u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); + + err = br_multicast_set_querier(br, mcast_querier); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]); + + br->hash_elasticity = val; + } + + if (data[IFLA_BR_MCAST_HASH_MAX]) { + u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); + + err = br_multicast_set_hash_max(br, hash_max); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); + + br->multicast_last_member_count = val; + } + + if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); + + br->multicast_startup_query_count = val; + } + + if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); + + br->multicast_last_member_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); + + br->multicast_membership_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); + + br->multicast_querier_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERY_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); + + br->multicast_query_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); + + br->multicast_query_response_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); + + br->multicast_startup_query_interval = clock_t_to_jiffies(val); + } +#endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (data[IFLA_BR_NF_CALL_IPTABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); + + br->nf_call_iptables = val ? true : false; + } + + if (data[IFLA_BR_NF_CALL_IP6TABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); + + br->nf_call_ip6tables = val ? true : false; + } + + if (data[IFLA_BR_NF_CALL_ARPTABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); + + br->nf_call_arptables = val ? true : false; + } +#endif + return 0; } @@ -796,6 +1072,43 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ +#endif + nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ + nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ + nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ +#endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ +#endif 0; } @@ -809,6 +1122,20 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br); + u64 clockval; + + clockval = br_timer_value(&br->hello_timer); + if (nla_put_u64(skb, IFLA_BR_HELLO_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->tcn_timer); + if (nla_put_u64(skb, IFLA_BR_TCN_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->topology_change_timer); + if (nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->gc_timer); + if (nla_put_u64(skb, IFLA_BR_GC_TIMER, clockval)) + return -EMSGSIZE; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || @@ -816,28 +1143,90 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || - nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled)) + nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || + nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) || + nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id), + &br->bridge_id) || + nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id), + &br->designated_root) || + nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) || + nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || + nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || + nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + br->topology_change_detected) || + nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) return -EMSGSIZE; +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || + nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid)) + return -EMSGSIZE; +#endif +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || + nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || + nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, + br->multicast_query_use_ifaddr) || + nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, + br->hash_elasticity) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || + nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, + br->multicast_last_member_count) || + nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, + br->multicast_startup_query_count)) + return -EMSGSIZE; + + clockval = jiffies_to_clock_t(br->multicast_last_member_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_membership_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_querier_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_query_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_query_response_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_startup_query_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval)) + return -EMSGSIZE; +#endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, + br->nf_call_iptables ? 1 : 0) || + nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, + br->nf_call_ip6tables ? 1 : 0) || + nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, + br->nf_call_arptables ? 1 : 0)) + return -EMSGSIZE; +#endif + return 0; } static size_t br_get_link_af_size(const struct net_device *dev) { - struct net_port_vlans *pv; - - if (br_port_exists(dev)) - pv = nbp_get_vlan_info(br_port_get_rtnl(dev)); - else if (dev->priv_flags & IFF_EBRIDGE) - pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev)); - else - return 0; + struct net_bridge_port *p; + struct net_bridge *br; + int num_vlans = 0; - if (!pv) - return 0; + if (br_port_exists(dev)) { + p = br_port_get_rtnl(dev); + num_vlans = br_get_num_vlan_infos(nbp_vlan_group(p), + RTEXT_FILTER_BRVLAN); + } else if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + num_vlans = br_get_num_vlan_infos(br_vlan_group(br), + RTEXT_FILTER_BRVLAN); + } /* Each VLAN is returned in bridge_vlan_info along with flags */ - return pv->num_vlans * nla_total_size(sizeof(struct bridge_vlan_info)); + return num_vlans * nla_total_size(sizeof(struct bridge_vlan_info)); } static struct rtnl_af_ops br_af_ops __read_mostly = { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3d95647039d0..ba0c67b2159a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -20,6 +20,7 @@ #include <net/route.h> #include <net/ip6_fib.h> #include <linux/if_vlan.h> +#include <linux/rhashtable.h> #define BR_HASH_BITS 8 #define BR_HASH_SIZE (1 << BR_HASH_BITS) @@ -28,7 +29,6 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) -#define BR_VLAN_BITMAP_LEN BITS_TO_LONGS(VLAN_N_VID) #define BR_VERSION "2.3" @@ -77,17 +77,61 @@ struct bridge_mcast_querier { }; #endif -struct net_port_vlans { - u16 port_idx; - u16 pvid; +/** + * struct net_bridge_vlan - per-vlan entry + * + * @vnode: rhashtable member + * @vid: VLAN id + * @flags: bridge vlan flags + * @br: if MASTER flag set, this points to a bridge struct + * @port: if MASTER flag unset, this points to a port struct + * @refcnt: if MASTER flag set, this is bumped for each port referencing it + * @brvlan: if MASTER flag unset, this points to the global per-VLAN context + * for this VLAN entry + * @vlist: sorted list of VLAN entries + * @rcu: used for entry destruction + * + * This structure is shared between the global per-VLAN entries contained in + * the bridge rhashtable and the local per-port per-VLAN entries contained in + * the port's rhashtable. The union entries should be interpreted depending on + * the entry flags that are set. + */ +struct net_bridge_vlan { + struct rhash_head vnode; + u16 vid; + u16 flags; union { - struct net_bridge_port *port; - struct net_bridge *br; - } parent; + struct net_bridge *br; + struct net_bridge_port *port; + }; + union { + atomic_t refcnt; + struct net_bridge_vlan *brvlan; + }; + struct list_head vlist; + struct rcu_head rcu; - unsigned long vlan_bitmap[BR_VLAN_BITMAP_LEN]; - unsigned long untagged_bitmap[BR_VLAN_BITMAP_LEN]; +}; + +/** + * struct net_bridge_vlan_group + * + * @vlan_hash: VLAN entry rhashtable + * @vlan_list: sorted VLAN entry list + * @num_vlans: number of total VLAN entries + * @pvid: PVID VLAN id + * + * IMPORTANT: Be careful when checking if there're VLAN entries using list + * primitives because the bridge can have entries in its list which + * are just for global context but not for filtering, i.e. they have + * the master flag set but not the brentry flag. If you have to check + * if there're "real" entries in the bridge please test @num_vlans + */ +struct net_bridge_vlan_group { + struct rhashtable vlan_hash; + struct list_head vlan_list; u16 num_vlans; + u16 pvid; }; struct net_bridge_fdb_entry @@ -95,15 +139,15 @@ struct net_bridge_fdb_entry struct hlist_node hlist; struct net_bridge_port *dst; - struct rcu_head rcu; unsigned long updated; unsigned long used; mac_addr addr; + __u16 vlan_id; unsigned char is_local:1, is_static:1, added_by_user:1, added_by_external_learn:1; - __u16 vlan_id; + struct rcu_head rcu; }; struct net_bridge_port_group { @@ -185,7 +229,7 @@ struct net_bridge_port struct netpoll *np; #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING - struct net_port_vlans __rcu *vlan_info; + struct net_bridge_vlan_group *vlgrp; #endif }; @@ -293,10 +337,10 @@ struct net_bridge struct kobject *ifobj; u32 auto_cnt; #ifdef CONFIG_BRIDGE_VLAN_FILTERING + struct net_bridge_vlan_group *vlgrp; u8 vlan_enabled; __be16 vlan_proto; u16 default_pvid; - struct net_port_vlans __rcu *vlan_info; #endif }; @@ -344,6 +388,31 @@ static inline int br_is_root_bridge(const struct net_bridge *br) return !memcmp(&br->bridge_id, &br->designated_root, 8); } +/* check if a VLAN entry is global */ +static inline bool br_vlan_is_master(const struct net_bridge_vlan *v) +{ + return v->flags & BRIDGE_VLAN_INFO_MASTER; +} + +/* check if a VLAN entry is used by the bridge */ +static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v) +{ + return v->flags & BRIDGE_VLAN_INFO_BRENTRY; +} + +/* check if we should use the vlan entry, returns false if it's only context */ +static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) +{ + if (br_vlan_is_master(v)) { + if (br_vlan_is_brentry(v)) + return true; + else + return false; + } + + return true; +} + /* br_device.c */ void br_dev_setup(struct net_device *dev); void br_dev_delete(struct net_device *dev, struct list_head *list); @@ -413,10 +482,10 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, /* br_forward.c */ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); -int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb); +int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0); -int br_forward_finish(struct sock *sk, struct sk_buff *skb); +int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast); void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, struct sk_buff *skb2, bool unicast); @@ -434,7 +503,7 @@ void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); void br_manage_promisc(struct net_bridge *br); /* br_input.c */ -int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); +int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); rx_handler_result_t br_handle_frame(struct sk_buff **pskb); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) @@ -601,40 +670,43 @@ static inline void br_mdb_uninit(void) /* br_vlan.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING -bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, - struct sk_buff *skb, u16 *vid); -bool br_allowed_egress(struct net_bridge *br, const struct net_port_vlans *v, +bool br_allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, + u16 *vid); +bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); struct sk_buff *br_handle_vlan(struct net_bridge *br, - const struct net_port_vlans *v, + struct net_bridge_vlan_group *vg, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); -bool br_vlan_find(struct net_bridge *br, u16 vid); +struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); -bool nbp_vlan_find(struct net_bridge_port *port, u16 vid); int nbp_vlan_init(struct net_bridge_port *port); +int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); -static inline struct net_port_vlans *br_get_vlan_info( - const struct net_bridge *br) +static inline struct net_bridge_vlan_group *br_vlan_group( + const struct net_bridge *br) { - return rcu_dereference_rtnl(br->vlan_info); + return br->vlgrp; } -static inline struct net_port_vlans *nbp_get_vlan_info( - const struct net_bridge_port *p) +static inline struct net_bridge_vlan_group *nbp_vlan_group( + const struct net_bridge_port *p) { - return rcu_dereference_rtnl(p->vlan_info); + return p->vlgrp; } /* Since bridge now depends on 8021Q module, but the time bridge sees the @@ -644,9 +716,9 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) { int err = 0; - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK; - else { + } else { *vid = 0; err = -EINVAL; } @@ -654,13 +726,13 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) return err; } -static inline u16 br_get_pvid(const struct net_port_vlans *v) +static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) { - if (!v) + if (!vg) return 0; smp_rmb(); - return v->pvid; + return vg->pvid; } static inline int br_vlan_enabled(struct net_bridge *br) @@ -668,16 +740,15 @@ static inline int br_vlan_enabled(struct net_bridge *br) return br->vlan_enabled; } #else -static inline bool br_allowed_ingress(struct net_bridge *br, - struct net_port_vlans *v, +static inline bool br_allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid) { return true; } -static inline bool br_allowed_egress(struct net_bridge *br, - const struct net_port_vlans *v, +static inline bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { return true; @@ -690,7 +761,7 @@ static inline bool br_should_learn(struct net_bridge_port *p, } static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, - const struct net_port_vlans *v, + struct net_bridge_vlan_group *vg, struct sk_buff *skb) { return skb; @@ -710,11 +781,6 @@ static inline void br_vlan_flush(struct net_bridge *br) { } -static inline bool br_vlan_find(struct net_bridge *br, u16 vid) -{ - return false; -} - static inline void br_recalculate_fwd_mask(struct net_bridge *br) { } @@ -738,21 +804,11 @@ static inline void nbp_vlan_flush(struct net_bridge_port *port) { } -static inline struct net_port_vlans *br_get_vlan_info( - const struct net_bridge *br) +static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, + u16 vid) { return NULL; } -static inline struct net_port_vlans *nbp_get_vlan_info( - const struct net_bridge_port *p) -{ - return NULL; -} - -static inline bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) -{ - return false; -} static inline int nbp_vlan_init(struct net_bridge_port *port) { @@ -763,7 +819,8 @@ static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) { return 0; } -static inline u16 br_get_pvid(const struct net_port_vlans *v) + +static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) { return 0; } @@ -778,6 +835,24 @@ static inline int __br_vlan_filter_toggle(struct net_bridge *br, { return -EOPNOTSUPP; } + +static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p, + u32 filter_mask) +{ + return 0; +} + +static inline struct net_bridge_vlan_group *br_vlan_group( + const struct net_bridge *br) +{ + return NULL; +} + +static inline struct net_bridge_vlan_group *nbp_vlan_group( + const struct net_bridge_port *p) +{ + return NULL; +} #endif struct nf_br_ops { @@ -807,6 +882,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t); int br_set_forward_delay(struct net_bridge *br, unsigned long x); int br_set_hello_time(struct net_bridge *br, unsigned long x); int br_set_max_age(struct net_bridge *br, unsigned long x); +int br_set_ageing_time(struct net_bridge *br, u32 ageing_time); /* br_stp_if.c */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index ed74ffaa851f..db6d243defb2 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -40,7 +40,7 @@ void br_log_state(const struct net_bridge_port *p) void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_STP_STATE, + .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, .u.stp_state = state, }; int err; @@ -566,6 +566,29 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) } +int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, + .u.ageing_time = ageing_time, + }; + unsigned long t = clock_t_to_jiffies(ageing_time); + int err; + + if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME) + return -ERANGE; + + err = switchdev_port_attr_set(br->dev, &attr); + if (err) + return err; + + br->ageing_time = t; + mod_timer(&br->gc_timer, jiffies); + + return 0; +} + void __br_set_forward_delay(struct net_bridge *br, unsigned long t) { br->bridge_forward_delay = t; @@ -576,17 +599,12 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t) int br_set_forward_delay(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); - int err = -ERANGE; - spin_lock_bh(&br->lock); - if (br->stp_enabled != BR_NO_STP && - (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)) - goto unlock; + if (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY) + return -ERANGE; + spin_lock_bh(&br->lock); __br_set_forward_delay(br, t); - err = 0; - -unlock: spin_unlock_bh(&br->lock); - return err; + return 0; } diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 534fc4cd263e..5881fbc114a9 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -30,6 +30,12 @@ #define LLC_RESERVE sizeof(struct llc_pdu_un) +static int br_send_bpdu_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} + static void br_send_bpdu(struct net_bridge_port *p, const unsigned char *data, int length) { @@ -54,9 +60,9 @@ static void br_send_bpdu(struct net_bridge_port *p, skb_reset_mac_header(skb); - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, - NULL, skb->dev, - dev_queue_xmit_sk); + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, + dev_net(p->dev), NULL, skb, NULL, skb->dev, + br_send_bpdu_finish); } static inline void br_set_ticks(unsigned char *dest, int j) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 4c97fc50fb70..04ef1926ee7e 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -102,8 +102,7 @@ static ssize_t ageing_time_show(struct device *d, static int set_ageing_time(struct net_bridge *br, unsigned long val) { - br->ageing_time = clock_t_to_jiffies(val); - return 0; + return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3cef6892c0bb..ad7e4f6b6d6b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -6,86 +6,195 @@ #include "br_private.h" -static void __vlan_add_pvid(struct net_port_vlans *v, u16 vid) +static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) { - if (v->pvid == vid) + const struct net_bridge_vlan *vle = ptr; + u16 vid = *(u16 *)arg->key; + + return vle->vid != vid; +} + +static const struct rhashtable_params br_vlan_rht_params = { + .head_offset = offsetof(struct net_bridge_vlan, vnode), + .key_offset = offsetof(struct net_bridge_vlan, vid), + .key_len = sizeof(u16), + .nelem_hint = 3, + .locks_mul = 1, + .max_size = VLAN_N_VID, + .obj_cmpfn = br_vlan_cmp, + .automatic_shrinking = true, +}; + +static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) +{ + return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); +} + +static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid) +{ + if (vg->pvid == vid) return; smp_wmb(); - v->pvid = vid; + vg->pvid = vid; } -static void __vlan_delete_pvid(struct net_port_vlans *v, u16 vid) +static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { - if (v->pvid != vid) + if (vg->pvid != vid) return; smp_wmb(); - v->pvid = 0; + vg->pvid = 0; } -static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags) +static void __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) { + struct net_bridge_vlan_group *vg; + + if (br_vlan_is_master(v)) + vg = v->br->vlgrp; + else + vg = v->port->vlgrp; + if (flags & BRIDGE_VLAN_INFO_PVID) - __vlan_add_pvid(v, vid); + __vlan_add_pvid(vg, v->vid); else - __vlan_delete_pvid(v, vid); + __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) - set_bit(vid, v->untagged_bitmap); + v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else - clear_bit(vid, v->untagged_bitmap); + v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, u16 vid, u16 flags) { - const struct net_device_ops *ops = dev->netdev_ops; + struct switchdev_obj_port_vlan v = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; int err; - /* If driver uses VLAN ndo ops, use 8021q to install vid - * on device, otherwise try switchdev ops to install vid. + /* Try switchdev op first. In case it is not supported, fallback to + * 8021q add. */ + err = switchdev_port_obj_add(dev, &v.obj); + if (err == -EOPNOTSUPP) + return vlan_vid_add(dev, br->vlan_proto, vid); + return err; +} - if (ops->ndo_vlan_rx_add_vid) { - err = vlan_vid_add(dev, br->vlan_proto, vid); - } else { - struct switchdev_obj vlan_obj = { - .id = SWITCHDEV_OBJ_PORT_VLAN, - .u.vlan = { - .flags = flags, - .vid_begin = vid, - .vid_end = vid, - }, - }; - - err = switchdev_port_obj_add(dev, &vlan_obj); - if (err == -EOPNOTSUPP) - err = 0; +static void __vlan_add_list(struct net_bridge_vlan *v) +{ + struct list_head *headp, *hpos; + struct net_bridge_vlan *vent; + + headp = br_vlan_is_master(v) ? &v->br->vlgrp->vlan_list : + &v->port->vlgrp->vlan_list; + list_for_each_prev(hpos, headp) { + vent = list_entry(hpos, struct net_bridge_vlan, vlist); + if (v->vid < vent->vid) + continue; + else + break; } + list_add_rcu(&v->vlist, hpos); +} - return err; +static void __vlan_del_list(struct net_bridge_vlan *v) +{ + list_del_rcu(&v->vlist); } -static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) +static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, + u16 vid) { - struct net_bridge_port *p = NULL; - struct net_bridge *br; - struct net_device *dev; + struct switchdev_obj_port_vlan v = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .vid_begin = vid, + .vid_end = vid, + }; int err; - if (test_bit(vid, v->vlan_bitmap)) { - __vlan_add_flags(v, vid, flags); + /* Try switchdev op first. In case it is not supported, fallback to + * 8021q del. + */ + err = switchdev_port_obj_del(dev, &v.obj); + if (err == -EOPNOTSUPP) { + vlan_vid_del(dev, br->vlan_proto, vid); return 0; } + return err; +} + +/* Returns a master vlan, if it didn't exist it gets created. In all cases a + * a reference is taken to the master vlan before returning. + */ +static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid) +{ + struct net_bridge_vlan *masterv; + + masterv = br_vlan_find(br->vlgrp, vid); + if (!masterv) { + /* missing global ctx, create it now */ + if (br_vlan_add(br, vid, 0)) + return NULL; + masterv = br_vlan_find(br->vlgrp, vid); + if (WARN_ON(!masterv)) + return NULL; + } + atomic_inc(&masterv->refcnt); + + return masterv; +} + +static void br_vlan_put_master(struct net_bridge_vlan *masterv) +{ + if (!br_vlan_is_master(masterv)) + return; + + if (atomic_dec_and_test(&masterv->refcnt)) { + rhashtable_remove_fast(&masterv->br->vlgrp->vlan_hash, + &masterv->vnode, br_vlan_rht_params); + __vlan_del_list(masterv); + kfree_rcu(masterv, rcu); + } +} - if (v->port_idx) { - p = v->parent.port; +/* This is the shared VLAN add function which works for both ports and bridge + * devices. There are four possible calls to this function in terms of the + * vlan entry type: + * 1. vlan is being added on a port (no master flags, global entry exists) + * 2. vlan is being added on a bridge (both master and brvlan flags) + * 3. vlan is being added on a port, but a global entry didn't exist which + * is being created right now (master flag set, brvlan flag unset), the + * global entry is used for global per-vlan features, but not for filtering + * 4. same as 3 but with both master and brvlan flags set so the entry + * will be used for filtering in both the port and the bridge + */ +static int __vlan_add(struct net_bridge_vlan *v, u16 flags) +{ + struct net_bridge_vlan *masterv = NULL; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan_group *vg; + struct net_device *dev; + struct net_bridge *br; + int err; + + if (br_vlan_is_master(v)) { + br = v->br; + dev = br->dev; + vg = br->vlgrp; + } else { + p = v->port; br = p->br; dev = p->dev; - } else { - br = v->parent.br; - dev = br->dev; + vg = p->vlgrp; } if (p) { @@ -93,107 +202,135 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ - err = __vlan_vid_add(dev, br, vid, flags); + err = __vlan_vid_add(dev, br, v->vid, flags); if (err) - return err; + goto out; + + /* need to work on the master vlan too */ + if (flags & BRIDGE_VLAN_INFO_MASTER) { + err = br_vlan_add(br, v->vid, flags | + BRIDGE_VLAN_INFO_BRENTRY); + if (err) + goto out_filt; + } + + masterv = br_vlan_get_master(br, v->vid); + if (!masterv) + goto out_filt; + v->brvlan = masterv; } - err = br_fdb_insert(br, p, dev->dev_addr, vid); - if (err) { - br_err(br, "failed insert local address into bridge " - "forwarding table\n"); - goto out_filt; + /* Add the dev mac and count the vlan only if it's usable */ + if (br_vlan_should_use(v)) { + err = br_fdb_insert(br, p, dev->dev_addr, v->vid); + if (err) { + br_err(br, "failed insert local address into bridge forwarding table\n"); + goto out_filt; + } + vg->num_vlans++; } - set_bit(vid, v->vlan_bitmap); - v->num_vlans++; - __vlan_add_flags(v, vid, flags); + err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, + br_vlan_rht_params); + if (err) + goto out_fdb_insert; - return 0; + __vlan_add_list(v); + __vlan_add_flags(v, flags); +out: + return err; + +out_fdb_insert: + if (br_vlan_should_use(v)) { + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); + vg->num_vlans--; + } out_filt: - if (p) - vlan_vid_del(dev, br->vlan_proto, vid); - return err; + if (p) { + __vlan_vid_del(dev, br, v->vid); + if (masterv) { + br_vlan_put_master(masterv); + v->brvlan = NULL; + } + } + + goto out; } -static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, - u16 vid) +static int __vlan_del(struct net_bridge_vlan *v) { - const struct net_device_ops *ops = dev->netdev_ops; - - /* If driver uses VLAN ndo ops, use 8021q to delete vid - * on device, otherwise try switchdev ops to delete vid. - */ + struct net_bridge_vlan *masterv = v; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + int err = 0; - if (ops->ndo_vlan_rx_kill_vid) { - vlan_vid_del(dev, br->vlan_proto, vid); + if (br_vlan_is_master(v)) { + vg = v->br->vlgrp; } else { - struct switchdev_obj vlan_obj = { - .id = SWITCHDEV_OBJ_PORT_VLAN, - .u.vlan = { - .vid_begin = vid, - .vid_end = vid, - }, - }; - - switchdev_port_obj_del(dev, &vlan_obj); + p = v->port; + vg = v->port->vlgrp; + masterv = v->brvlan; } -} -static int __vlan_del(struct net_port_vlans *v, u16 vid) -{ - if (!test_bit(vid, v->vlan_bitmap)) - return -EINVAL; - - __vlan_delete_pvid(v, vid); - clear_bit(vid, v->untagged_bitmap); + __vlan_delete_pvid(vg, v->vid); + if (p) { + err = __vlan_vid_del(p->dev, p->br, v->vid); + if (err) + goto out; + } - if (v->port_idx) { - struct net_bridge_port *p = v->parent.port; - __vlan_vid_del(p->dev, p->br, vid); + if (br_vlan_should_use(v)) { + v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; + vg->num_vlans--; } - clear_bit(vid, v->vlan_bitmap); - v->num_vlans--; - if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) { - if (v->port_idx) - RCU_INIT_POINTER(v->parent.port->vlan_info, NULL); - else - RCU_INIT_POINTER(v->parent.br->vlan_info, NULL); + if (masterv != v) { + rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, + br_vlan_rht_params); + __vlan_del_list(v); kfree_rcu(v, rcu); } - return 0; + + br_vlan_put_master(masterv); +out: + return err; } -static void __vlan_flush(struct net_port_vlans *v) +static void __vlan_flush(struct net_bridge_vlan_group *vlgrp) { - smp_wmb(); - v->pvid = 0; - bitmap_zero(v->vlan_bitmap, VLAN_N_VID); - if (v->port_idx) - RCU_INIT_POINTER(v->parent.port->vlan_info, NULL); - else - RCU_INIT_POINTER(v->parent.br->vlan_info, NULL); - kfree_rcu(v, rcu); + struct net_bridge_vlan *vlan, *tmp; + + __vlan_delete_pvid(vlgrp, vlgrp->pvid); + list_for_each_entry_safe(vlan, tmp, &vlgrp->vlan_list, vlist) + __vlan_del(vlan); + rhashtable_destroy(&vlgrp->vlan_hash); + kfree(vlgrp); } struct sk_buff *br_handle_vlan(struct net_bridge *br, - const struct net_port_vlans *pv, + struct net_bridge_vlan_group *vg, struct sk_buff *skb) { + struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; - /* Vlan filter table must be configured at this point. The + /* At this point, we know that the frame was filtered and contains + * a valid vlan id. If the vlan id has untagged flag set, + * send untagged; otherwise, send tagged. + */ + br_vlan_get_tag(skb, &vid); + v = br_vlan_find(vg, vid); + /* Vlan entry must be configured at this point. The * only exception is the bridge is set in promisc mode and the * packet is destined for the bridge device. In this case * pass the packet as is. */ - if (!pv) { + if (!v || !br_vlan_should_use(v)) { if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { goto out; } else { @@ -201,13 +338,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, return NULL; } } - - /* At this point, we know that the frame was filtered and contains - * a valid vlan id. If the vlan id is set in the untagged bitmap, - * send untagged; otherwise, send tagged. - */ - br_vlan_get_tag(skb, &vid); - if (test_bit(vid, pv->untagged_bitmap)) + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) skb->vlan_tci = 0; out: @@ -215,29 +346,13 @@ out: } /* Called under RCU */ -bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, - struct sk_buff *skb, u16 *vid) +static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, + struct sk_buff *skb, u16 *vid) { + const struct net_bridge_vlan *v; bool tagged; - __be16 proto; - - /* If VLAN filtering is disabled on the bridge, all packets are - * permitted. - */ - if (!br->vlan_enabled) { - BR_INPUT_SKB_CB(skb)->vlan_filtered = false; - return true; - } - - /* If there are no vlan in the permitted list, all packets are - * rejected. - */ - if (!v) - goto drop; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; - proto = br->vlan_proto; - /* If vlan tx offload is disabled on bridge device and frame was * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. @@ -272,7 +387,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, } if (!*vid) { - u16 pvid = br_get_pvid(v); + u16 pvid = br_get_pvid(vg); /* Frame had a tag with VID 0 or did not have a tag. * See if pvid is set on this port. That tells us which @@ -300,29 +415,43 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, } /* Frame had a valid vlan tag. See if vlan is allowed */ - if (test_bit(*vid, v->vlan_bitmap)) + v = br_vlan_find(vg, *vid); + if (v && br_vlan_should_use(v)) return true; drop: kfree_skb(skb); return false; } +bool br_allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, + u16 *vid) +{ + /* If VLAN filtering is disabled on the bridge, all packets are + * permitted. + */ + if (!br->vlan_enabled) { + BR_INPUT_SKB_CB(skb)->vlan_filtered = false; + return true; + } + + return __allowed_ingress(vg, br->vlan_proto, skb, vid); +} + /* Called under RCU. */ -bool br_allowed_egress(struct net_bridge *br, - const struct net_port_vlans *v, +bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { + const struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; - if (!v) - return false; - br_vlan_get_tag(skb, &vid); - if (test_bit(vid, v->vlan_bitmap)) + v = br_vlan_find(vg, vid); + if (v && br_vlan_should_use(v)) return true; return false; @@ -331,29 +460,29 @@ bool br_allowed_egress(struct net_bridge *br, /* Called under RCU */ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { + struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; - struct net_port_vlans *v; /* If filtering was disabled at input, let it pass. */ if (!br->vlan_enabled) return true; - v = rcu_dereference(p->vlan_info); - if (!v) + vg = p->vlgrp; + if (!vg || !vg->num_vlans) return false; if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) *vid = 0; if (!*vid) { - *vid = br_get_pvid(v); + *vid = br_get_pvid(vg); if (!*vid) return false; return true; } - if (test_bit(*vid, v->vlan_bitmap)) + if (br_vlan_find(vg, *vid)) return true; return false; @@ -364,31 +493,47 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) */ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) { - struct net_port_vlans *pv = NULL; - int err; + struct net_bridge_vlan *vlan; + int ret; ASSERT_RTNL(); - pv = rtnl_dereference(br->vlan_info); - if (pv) - return __vlan_add(pv, vid, flags); + vlan = br_vlan_find(br->vlgrp, vid); + if (vlan) { + if (!br_vlan_is_brentry(vlan)) { + /* Trying to change flags of non-existent bridge vlan */ + if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) + return -EINVAL; + /* It was only kept for port vlans, now make it real */ + ret = br_fdb_insert(br, NULL, br->dev->dev_addr, + vlan->vid); + if (ret) { + br_err(br, "failed insert local address into bridge forwarding table\n"); + return ret; + } + atomic_inc(&vlan->refcnt); + vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; + br->vlgrp->num_vlans++; + } + __vlan_add_flags(vlan, flags); + return 0; + } - /* Create port vlan infomration - */ - pv = kzalloc(sizeof(*pv), GFP_KERNEL); - if (!pv) + vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + if (!vlan) return -ENOMEM; - pv->parent.br = br; - err = __vlan_add(pv, vid, flags); - if (err) - goto out; - - rcu_assign_pointer(br->vlan_info, pv); - return 0; -out: - kfree(pv); - return err; + vlan->vid = vid; + vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; + vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; + vlan->br = br; + if (flags & BRIDGE_VLAN_INFO_BRENTRY) + atomic_set(&vlan->refcnt, 1); + ret = __vlan_add(vlan, flags); + if (ret) + kfree(vlan); + + return ret; } /* Must be protected by RTNL. @@ -396,49 +541,33 @@ out: */ int br_vlan_delete(struct net_bridge *br, u16 vid) { - struct net_port_vlans *pv; + struct net_bridge_vlan *v; ASSERT_RTNL(); - pv = rtnl_dereference(br->vlan_info); - if (!pv) - return -EINVAL; + v = br_vlan_find(br->vlgrp, vid); + if (!v || !br_vlan_is_brentry(v)) + return -ENOENT; br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); + br_fdb_delete_by_port(br, NULL, vid, 0); - __vlan_del(pv, vid); - return 0; + return __vlan_del(v); } void br_vlan_flush(struct net_bridge *br) { - struct net_port_vlans *pv; - ASSERT_RTNL(); - pv = rtnl_dereference(br->vlan_info); - if (!pv) - return; - __vlan_flush(pv); + __vlan_flush(br_vlan_group(br)); } -bool br_vlan_find(struct net_bridge *br, u16 vid) +struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { - struct net_port_vlans *pv; - bool found = false; - - rcu_read_lock(); - pv = rcu_dereference(br->vlan_info); + if (!vg) + return NULL; - if (!pv) - goto out; - - if (test_bit(vid, pv->vlan_bitmap)) - found = true; - -out: - rcu_read_unlock(); - return found; + return br_vlan_lookup(&vg->vlan_hash, vid); } /* Must be protected by RTNL. */ @@ -492,32 +621,20 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) return 0; } -int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) { int err = 0; struct net_bridge_port *p; - struct net_port_vlans *pv; - __be16 proto, oldproto; - u16 vid, errvid; - - if (val != ETH_P_8021Q && val != ETH_P_8021AD) - return -EPROTONOSUPPORT; - - if (!rtnl_trylock()) - return restart_syscall(); + struct net_bridge_vlan *vlan; + __be16 oldproto; - proto = htons(val); if (br->vlan_proto == proto) - goto unlock; + return 0; /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { - pv = rtnl_dereference(p->vlan_info); - if (!pv) - continue; - - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - err = vlan_vid_add(p->dev, proto, vid); + list_for_each_entry(vlan, &p->vlgrp->vlan_list, vlist) { + err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; } @@ -530,39 +647,52 @@ int br_vlan_set_proto(struct net_bridge *br, unsigned long val) br_recalculate_fwd_mask(br); /* Delete VLANs for the old proto from the device filter. */ - list_for_each_entry(p, &br->port_list, list) { - pv = rtnl_dereference(p->vlan_info); - if (!pv) - continue; + list_for_each_entry(p, &br->port_list, list) + list_for_each_entry(vlan, &p->vlgrp->vlan_list, vlist) + vlan_vid_del(p->dev, oldproto, vlan->vid); - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - vlan_vid_del(p->dev, oldproto, vid); - } + return 0; + +err_filt: + list_for_each_entry_continue_reverse(vlan, &p->vlgrp->vlan_list, vlist) + vlan_vid_del(p->dev, proto, vlan->vid); + + list_for_each_entry_continue_reverse(p, &br->port_list, list) + list_for_each_entry(vlan, &p->vlgrp->vlan_list, vlist) + vlan_vid_del(p->dev, proto, vlan->vid); -unlock: - rtnl_unlock(); return err; +} -err_filt: - errvid = vid; - for_each_set_bit(vid, pv->vlan_bitmap, errvid) - vlan_vid_del(p->dev, proto, vid); +int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +{ + int err; - list_for_each_entry_continue_reverse(p, &br->port_list, list) { - pv = rtnl_dereference(p->vlan_info); - if (!pv) - continue; + if (val != ETH_P_8021Q && val != ETH_P_8021AD) + return -EPROTONOSUPPORT; - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - vlan_vid_del(p->dev, proto, vid); - } + if (!rtnl_trylock()) + return restart_syscall(); + + err = __br_vlan_set_proto(br, htons(val)); + rtnl_unlock(); - goto unlock; + return err; } -static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid) +static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { - return pv && vid == pv->pvid && test_bit(vid, pv->untagged_bitmap); + struct net_bridge_vlan *v; + + if (vid != vg->pvid) + return false; + + v = br_vlan_lookup(&vg->vlan_hash, vid); + if (v && br_vlan_should_use(v) && + (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) + return true; + + return false; } static void br_vlan_disable_default_pvid(struct net_bridge *br) @@ -573,24 +703,30 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) /* Disable default_pvid on all ports where it is still * configured. */ - if (vlan_default_pvid(br_get_vlan_info(br), pvid)) + if (vlan_default_pvid(br->vlgrp, pvid)) br_vlan_delete(br, pvid); list_for_each_entry(p, &br->port_list, list) { - if (vlan_default_pvid(nbp_get_vlan_info(p), pvid)) + if (vlan_default_pvid(p->vlgrp, pvid)) nbp_vlan_delete(p, pvid); } br->default_pvid = 0; } -static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) { + const struct net_bridge_vlan *pvent; struct net_bridge_port *p; u16 old_pvid; int err = 0; unsigned long *changed; + if (!pvid) { + br_vlan_disable_default_pvid(br); + return 0; + } + changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), GFP_KERNEL); if (!changed) @@ -601,11 +737,13 @@ static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) /* Update default_pvid config only if we do not conflict with * user configuration. */ - if ((!old_pvid || vlan_default_pvid(br_get_vlan_info(br), old_pvid)) && - !br_vlan_find(br, pvid)) { + pvent = br_vlan_find(br->vlgrp, pvid); + if ((!old_pvid || vlan_default_pvid(br->vlgrp, old_pvid)) && + (!pvent || !br_vlan_should_use(pvent))) { err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED); + BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_BRENTRY); if (err) goto out; br_vlan_delete(br, old_pvid); @@ -617,8 +755,8 @@ static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) * user configuration. */ if ((old_pvid && - !vlan_default_pvid(nbp_get_vlan_info(p), old_pvid)) || - nbp_vlan_find(p, pvid)) + !vlan_default_pvid(p->vlgrp, old_pvid)) || + br_vlan_find(p->vlgrp, pvid)) continue; err = nbp_vlan_add(p, pvid, @@ -652,7 +790,8 @@ err_port: if (old_pvid) br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED); + BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_BRENTRY); br_vlan_delete(br, pvid); } goto out; @@ -678,12 +817,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) err = -EPERM; goto unlock; } - - if (!pvid) - br_vlan_disable_default_pvid(br); - else - err = __br_vlan_set_default_pvid(br, pvid); - + err = __br_vlan_set_default_pvid(br, pvid); unlock: rtnl_unlock(); return err; @@ -691,10 +825,66 @@ unlock: int br_vlan_init(struct net_bridge *br) { + int ret = -ENOMEM; + + br->vlgrp = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); + if (!br->vlgrp) + goto out; + ret = rhashtable_init(&br->vlgrp->vlan_hash, &br_vlan_rht_params); + if (ret) + goto err_rhtbl; + INIT_LIST_HEAD(&br->vlgrp->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; - return br_vlan_add(br, 1, - BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED); + ret = br_vlan_add(br, 1, + BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_BRENTRY); + if (ret) + goto err_vlan_add; + +out: + return ret; + +err_vlan_add: + rhashtable_destroy(&br->vlgrp->vlan_hash); +err_rhtbl: + kfree(br->vlgrp); + + goto out; +} + +int nbp_vlan_init(struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + int ret = -ENOMEM; + + vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); + if (!vg) + goto out; + + ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); + if (ret) + goto err_rhtbl; + INIT_LIST_HEAD(&vg->vlan_list); + /* Make sure everything's committed before publishing vg */ + smp_wmb(); + p->vlgrp = vg; + if (p->br->default_pvid) { + ret = nbp_vlan_add(p, p->br->default_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + if (ret) + goto err_vlan_add; + } +out: + return ret; + +err_vlan_add: + rhashtable_destroy(&vg->vlan_hash); +err_rhtbl: + kfree(vg); + + goto out; } /* Must be protected by RTNL. @@ -702,35 +892,28 @@ int br_vlan_init(struct net_bridge *br) */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { - struct net_port_vlans *pv = NULL; - int err; + struct net_bridge_vlan *vlan; + int ret; ASSERT_RTNL(); - pv = rtnl_dereference(port->vlan_info); - if (pv) - return __vlan_add(pv, vid, flags); - - /* Create port vlan infomration - */ - pv = kzalloc(sizeof(*pv), GFP_KERNEL); - if (!pv) { - err = -ENOMEM; - goto clean_up; + vlan = br_vlan_find(port->vlgrp, vid); + if (vlan) { + __vlan_add_flags(vlan, flags); + return 0; } - pv->port_idx = port->port_no; - pv->parent.port = port; - err = __vlan_add(pv, vid, flags); - if (err) - goto clean_up; + vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + if (!vlan) + return -ENOMEM; - rcu_assign_pointer(port->vlan_info, pv); - return 0; + vlan->vid = vid; + vlan->port = port; + ret = __vlan_add(vlan, flags); + if (ret) + kfree(vlan); -clean_up: - kfree(pv); - return err; + return ret; } /* Must be protected by RTNL. @@ -738,61 +921,27 @@ clean_up: */ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { - struct net_port_vlans *pv; + struct net_bridge_vlan *v; ASSERT_RTNL(); - pv = rtnl_dereference(port->vlan_info); - if (!pv) - return -EINVAL; - + v = br_vlan_find(port->vlgrp, vid); + if (!v) + return -ENOENT; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); br_fdb_delete_by_port(port->br, port, vid, 0); - return __vlan_del(pv, vid); + return __vlan_del(v); } void nbp_vlan_flush(struct net_bridge_port *port) { - struct net_port_vlans *pv; - u16 vid; + struct net_bridge_vlan *vlan; ASSERT_RTNL(); - pv = rtnl_dereference(port->vlan_info); - if (!pv) - return; - - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - vlan_vid_del(port->dev, port->br->vlan_proto, vid); - - __vlan_flush(pv); -} - -bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) -{ - struct net_port_vlans *pv; - bool found = false; - - rcu_read_lock(); - pv = rcu_dereference(port->vlan_info); - - if (!pv) - goto out; + list_for_each_entry(vlan, &port->vlgrp->vlan_list, vlist) + vlan_vid_del(port->dev, port->br->vlan_proto, vlan->vid); - if (test_bit(vid, pv->vlan_bitmap)) - found = true; - -out: - rcu_read_unlock(); - return found; -} - -int nbp_vlan_init(struct net_bridge_port *p) -{ - return p->br->default_pvid ? - nbp_vlan_add(p, p->br->default_pvid, - BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED) : - 0; + __vlan_flush(nbp_vlan_group(port)); } diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 17f2e4bc2a29..0ad639a96142 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -180,7 +180,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_log_info *info = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_LOG; li.u.log.level = info->loglevel; diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c index 59ac7952010d..54816150608e 100644 --- a/net/bridge/netfilter/ebt_nflog.c +++ b/net/bridge/netfilter/ebt_nflog.c @@ -24,7 +24,7 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nflog_info *info = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index d2cdf5d6e98c..ec94c6f1ae88 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -50,10 +50,14 @@ static const struct ebt_table broute_table = { static int ebt_broute(struct sk_buff *skb) { + struct nf_hook_state state; int ret; - ret = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL, - dev_net(skb->dev)->xt.broute_table); + nf_hook_state_init(&state, NULL, NF_BR_BROUTING, INT_MIN, + NFPROTO_BRIDGE, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); + + ret = ebt_do_table(skb, &state, state.net->xt.broute_table); if (ret == NF_DROP) return 1; /* route it */ return 0; /* bridge it */ diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 8a3f63b2e807..f9242dffa65e 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -57,19 +57,17 @@ static const struct ebt_table frame_filter = { }; static unsigned int -ebt_in_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->in)->xt.frame_filter); + return ebt_do_table(skb, state, state->net->xt.frame_filter); } static unsigned int -ebt_out_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->out)->xt.frame_filter); + return ebt_do_table(skb, state, state->net->xt.frame_filter); } static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index c5ef5b1ab678..4bbefe03ab58 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -57,19 +57,17 @@ static struct ebt_table frame_nat = { }; static unsigned int -ebt_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_nat_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->in)->xt.frame_nat); + return ebt_do_table(skb, state, state->net->xt.frame_nat); } static unsigned int -ebt_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_nat_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->out)->xt.frame_nat); + return ebt_do_table(skb, state, state->net->xt.frame_nat); } static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 18ca4b24c418..f46ca417bf2d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -176,17 +176,18 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, return 0; } -static inline __pure +static inline struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) { return (void *)entry + entry->next_offset; } /* Do some firewalling */ -unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - struct ebt_table *table) +unsigned int ebt_do_table(struct sk_buff *skb, + const struct nf_hook_state *state, + struct ebt_table *table) { + unsigned int hook = state->hook; int i, nentries; struct ebt_entry *point; struct ebt_counter *counter_base, *cb_base; @@ -199,8 +200,9 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, struct xt_action_param acpar; acpar.family = NFPROTO_BRIDGE; - acpar.in = in; - acpar.out = out; + acpar.net = state->net; + acpar.in = state->in; + acpar.out = state->out; acpar.hotdrop = false; acpar.hooknum = hook; @@ -220,7 +222,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, base = private->entries; i = 0; while (i < nentries) { - if (ebt_basic_match(point, skb, in, out)) + if (ebt_basic_match(point, skb, state->in, state->out)) goto letscontinue; if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0) diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index a343e62442b1..62f6b1b19589 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -65,31 +65,29 @@ int nft_bridge_ip6hdr_validate(struct sk_buff *skb) EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate); static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { if (nft_bridge_iphdr_validate(skb)) - nft_set_pktinfo_ipv4(pkt, ops, skb, state); + nft_set_pktinfo_ipv4(pkt, skb, state); else - nft_set_pktinfo(pkt, ops, skb, state); + nft_set_pktinfo(pkt, skb, state); } static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) if (nft_bridge_ip6hdr_validate(skb) && - nft_set_pktinfo_ipv6(pkt, ops, skb, state) == 0) + nft_set_pktinfo_ipv6(pkt, skb, state) == 0) return; #endif - nft_set_pktinfo(pkt, ops, skb, state); + nft_set_pktinfo(pkt, skb, state); } static unsigned int -nft_do_chain_bridge(const struct nf_hook_ops *ops, +nft_do_chain_bridge(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -97,17 +95,17 @@ nft_do_chain_bridge(const struct nf_hook_ops *ops, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_bridge_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_bridge_set_pktinfo_ipv4(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_bridge_set_pktinfo_ipv6(&pkt, ops, skb, state); + nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, ops, skb, state); + nft_set_pktinfo(&pkt, skb, state); break; } - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } static struct nft_af_info nft_af_bridge __read_mostly = { diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 858d848564ee..fdba3d9fbff3 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -261,7 +261,6 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); const unsigned char *dest = eth_hdr(pkt->skb)->h_dest; if (is_broadcast_ether_addr(dest) || @@ -273,16 +272,16 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->ops->hooknum, + pkt->hook, priv->icmp_code); break; case NFT_REJECT_TCP_RST: nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in, - pkt->ops->hooknum); + pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->ops->hooknum, + pkt->hook, nft_reject_icmp_code(priv->icmp_code)); break; } @@ -290,17 +289,17 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, case htons(ETH_P_IPV6): switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in, - pkt->ops->hooknum, + nft_reject_br_send_v6_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, priv->icmp_code); break; case NFT_REJECT_TCP_RST: - nft_reject_br_send_v6_tcp_reset(net, pkt->skb, pkt->in, - pkt->ops->hooknum); + nft_reject_br_send_v6_tcp_reset(pkt->net, pkt->skb, + pkt->in, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: - nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in, - pkt->ops->hooknum, + nft_reject_br_send_v6_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, nft_reject_icmpv6_code(priv->icmp_code)); break; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f30329f72641..54a00d66509e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name, opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; + opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] */ @@ -517,8 +518,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) struct ceph_options *opt = client->options; size_t pos = m->count; - if (opt->name) - seq_printf(m, "name=%s,", opt->name); + if (opt->name) { + seq_puts(m, "name="); + seq_escape(m, opt->name, ", \t\n\\"); + seq_putc(m, ','); + } if (opt->key) seq_puts(m, "secret=<hidden>,"); diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 790fe89d90c0..4440edcce0d6 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) return 0; } - - -#define AES_KEY_SIZE 16 - static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) { return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e3be1d22a247..b9b0e3b5da49 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -163,6 +163,7 @@ static struct kmem_cache *ceph_msg_data_cache; static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; @@ -176,7 +177,7 @@ static struct lock_class_key socket_class; static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); -static void con_work(struct work_struct *); +static void ceph_con_workfn(struct work_struct *); static void con_fault(struct ceph_connection *con); /* @@ -276,22 +277,22 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - ceph_msgr_slab_exit(); - BUG_ON(zero_page == NULL); page_cache_release(zero_page); zero_page = NULL; + + ceph_msgr_slab_exit(); } int ceph_msgr_init(void) { + if (ceph_msgr_slab_init()) + return -ENOMEM; + BUG_ON(zero_page != NULL); zero_page = ZERO_PAGE(0); page_cache_get(zero_page); - if (ceph_msgr_slab_init()) - return -ENOMEM; - /* * The number of active work items is limited by the number of * connections, so leave @max_active at default. @@ -749,7 +750,7 @@ void ceph_con_init(struct ceph_connection *con, void *private, mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); - INIT_DELAYED_WORK(&con->work, con_work); + INIT_DELAYED_WORK(&con->work, ceph_con_workfn); con->state = CON_STATE_CLOSED; } @@ -1351,7 +1352,16 @@ static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); - con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec now = CURRENT_TIME; + + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec(&con->out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), + &con->out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } con_flag_set(con, CON_FLAG_WRITE_PENDING); } @@ -1625,6 +1635,12 @@ static void prepare_read_tag(struct ceph_connection *con) con->in_tag = CEPH_MSGR_TAG_READY; } +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->in_base_pos = 0; +} + /* * Prepare to read a message. */ @@ -2322,13 +2338,6 @@ static int read_partial_message(struct ceph_connection *con) return ret; BUG_ON(!con->in_msg ^ skip); - if (con->in_msg && data_len > con->in_msg->data_length) { - pr_warn("%s skipping long message (%u > %zd)\n", - __func__, data_len, con->in_msg->data_length); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - skip = 1; - } if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); @@ -2457,6 +2466,17 @@ static void process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} /* * Write something to the socket. Called in a worker thread when the @@ -2526,6 +2546,10 @@ more_kvec: do_next: if (con->state == CON_STATE_OPEN) { + if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); @@ -2535,10 +2559,6 @@ do_next: prepare_write_ack(con); goto more; } - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } } /* Nothing to do! */ @@ -2641,6 +2661,9 @@ more: case CEPH_MSGR_TAG_ACK: prepare_read_ack(con); break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; case CEPH_MSGR_TAG_CLOSE: con_close_socket(con); con->state = CON_STATE_CLOSED; @@ -2684,6 +2707,12 @@ more: process_ack(con); goto more; } + if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } out: dout("try_read done on %p ret %d\n", con, ret); @@ -2799,7 +2828,7 @@ static void con_fault_finish(struct ceph_connection *con) /* * Do some work on a connection. Drop a connection ref when we're done. */ -static void con_work(struct work_struct *work) +static void ceph_con_workfn(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); @@ -3101,6 +3130,20 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); +bool ceph_con_keepalive_expired(struct ceph_connection *con, + unsigned long interval) +{ + if (interval > 0 && + (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { + struct timespec now = CURRENT_TIME; + struct timespec ts; + jiffies_to_timespec(interval, &ts); + ts = timespec_add(con->last_keepalive_ack, ts); + return timespec_compare(&now, &ts) >= 0; + } + return false; +} + static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) { struct ceph_msg_data *data; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 9d6ff1215928..edda01626a45 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc) CEPH_ENTITY_TYPE_MON, monc->cur_mon, &monc->monmap->mon_inst[monc->cur_mon].addr); + /* send an initial keepalive to ensure our timestamp is + * valid by the time we are in an OPENED state */ + ceph_con_keepalive(&monc->con); + /* initiatiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, monc->m_auth->front.iov_base, @@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc) */ static void __schedule_delayed(struct ceph_mon_client *monc) { - unsigned int delay; + struct ceph_options *opt = monc->client->options; + unsigned long delay; - if (monc->cur_mon < 0 || __sub_expired(monc)) + if (monc->cur_mon < 0 || __sub_expired(monc)) { delay = 10 * HZ; - else + } else { delay = 20 * HZ; - dout("__schedule_delayed after %u\n", delay); - schedule_delayed_work(&monc->delayed_work, delay); + if (opt->monc_ping_timeout > 0) + delay = min(delay, opt->monc_ping_timeout / 3); + } + dout("__schedule_delayed after %lu\n", delay); + schedule_delayed_work(&monc->delayed_work, + round_jiffies_relative(delay)); } /* @@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - ceph_con_keepalive(&monc->con); + struct ceph_options *opt = monc->client->options; + int is_auth = ceph_auth_is_authenticated(monc->auth); + if (ceph_con_keepalive_expired(&monc->con, + opt->monc_ping_timeout)) { + dout("monc keepalive timeout\n"); + is_auth = 0; + __close_session(monc); + monc->hunting = true; + __open_session(monc); + } - __validate_auth(monc); + if (!monc->hunting) { + ceph_con_keepalive(&monc->con); + __validate_auth(monc); + } - if (ceph_auth_is_authenticated(monc->auth)) + if (is_auth) __send_subscribe(monc); } __schedule_delayed(monc); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 50033677c0fa..80b94e37c94a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2817,8 +2817,9 @@ out: } /* - * lookup and return message for incoming reply. set up reply message - * pages. + * Lookup and return message for incoming reply. Don't try to do + * anything about a larger than preallocated data portion of the + * message at the moment - for now, just skip the message. */ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -2836,10 +2837,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (!req) { - *skip = 1; + pr_warn("%s osd%d tid %llu unknown, skipping\n", + __func__, osd->o_osd, tid); m = NULL; - dout("get_reply unknown tid %llu from osd%d\n", tid, - osd->o_osd); + *skip = 1; goto out; } @@ -2849,10 +2850,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ceph_msg_revoke_incoming(req->r_reply); if (front_len > req->r_reply->front_alloc_len) { - pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n", - front_len, req->r_reply->front_alloc_len, - (unsigned int)con->peer_name.type, - le64_to_cpu(con->peer_name.num)); + pr_warn("%s osd%d tid %llu front %d > preallocated %d\n", + __func__, osd->o_osd, req->r_tid, front_len, + req->r_reply->front_alloc_len); m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) @@ -2860,37 +2860,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ceph_msg_put(req->r_reply); req->r_reply = m; } - m = ceph_msg_get(req->r_reply); - - if (data_len > 0) { - struct ceph_osd_data *osd_data; - /* - * XXX This is assuming there is only one op containing - * XXX page data. Probably OK for reads, but this - * XXX ought to be done more generally. - */ - osd_data = osd_req_op_extent_osd_data(req, 0); - if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - if (osd_data->pages && - unlikely(osd_data->length < data_len)) { - - pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n", - tid, data_len, osd_data->length); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; - } - } + if (data_len > req->r_reply->data_length) { + pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", + __func__, osd->o_osd, req->r_tid, data_len, + req->r_reply->data_length); + m = NULL; + *skip = 1; + goto out; } - *skip = 0; + + m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); out: mutex_unlock(&osdc->request_mutex); return m; - } static struct ceph_msg *alloc_msg(struct ceph_connection *con, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4a3125836b64..7d8f581d9f1f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); - map->osd_state[osd] |= CEPH_OSD_UP; + map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS; map->osd_addr[osd] = addr; } diff --git a/net/core/dev.c b/net/core/dev.c index b1f3f4844e60..a229bf0d649d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2915,9 +2915,11 @@ EXPORT_SYMBOL(xmit_recursion); /** * dev_loopback_xmit - loop back @skb + * @net: network namespace this loopback is happening in + * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ -int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) +int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); @@ -2972,6 +2974,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) new_index = skb_tx_hash(dev, skb); if (queue_index != new_index && sk && + sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); @@ -3143,11 +3146,11 @@ out: return rc; } -int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb) +int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } -EXPORT_SYMBOL(dev_queue_xmit_sk); +EXPORT_SYMBOL(dev_queue_xmit); int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) { @@ -3657,7 +3660,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res)) { + switch (tc_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3668,6 +3671,14 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, case TC_ACT_QUEUED: kfree_skb(skb); return NULL; + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by cls/act_bpf, so + * we can safely push the L2 header back before + * redirecting to another netdev + */ + __skb_push(skb, skb->mac_len); + skb_do_redirect(skb); + return NULL; default: break; } @@ -3982,13 +3993,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb) * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ -int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb) +int netif_receive_skb(struct sk_buff *skb) { trace_netif_receive_skb_entry(skb); return netif_receive_skb_internal(skb); } -EXPORT_SYMBOL(netif_receive_skb_sk); +EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending * Called with irqs disabled. @@ -4713,6 +4724,8 @@ void napi_disable(struct napi_struct *n) while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); hrtimer_cancel(&n->timer); @@ -4855,8 +4868,7 @@ struct netdev_adjacent { struct rcu_head rcu; }; -static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, - struct net_device *adj_dev, +static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; @@ -4882,7 +4894,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); + return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -5144,7 +5156,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, struct netdev_adjacent *adj; int ret; - adj = __netdev_find_adj(dev, adj_dev, dev_list); + adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr++; @@ -5200,7 +5212,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, { struct netdev_adjacent *adj; - adj = __netdev_find_adj(dev, adj_dev, dev_list); + adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("tried to remove device %s from %s\n", @@ -5311,6 +5323,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *private) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -5320,15 +5333,19 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) + if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper)) + if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = master; + changeupper_info.linking = true; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, master); if (ret) @@ -5367,7 +5384,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); return 0; rollback_lower_mesh: @@ -5462,9 +5480,14 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j; ASSERT_RTNL(); + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + changeupper_info.linking = false; + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower @@ -5484,7 +5507,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -5590,7 +5614,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev, if (!lower_dev) return NULL; - lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; @@ -6998,7 +7022,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, setup(dev); if (!dev->tx_queue_len) - printk(KERN_WARNING "%s uses DEPRECATED zero tx_queue_len - convert driver to use IFF_NO_QUEUE instead.\n", name); + dev->priv_flags |= IFF_NO_QUEUE; dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; diff --git a/net/core/dst.c b/net/core/dst.c index 477035ed7903..2a1818065e12 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -144,12 +144,12 @@ loop: mutex_unlock(&dst_gc_mutex); } -int dst_discard_sk(struct sock *sk, struct sk_buff *skb) +int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard_sk); +EXPORT_SYMBOL(dst_discard_out); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -177,7 +177,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -224,7 +224,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -352,7 +352,7 @@ static struct dst_ops md_dst_ops = { .family = AF_UNSPEC, }; -static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb) +static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { WARN_ONCE(1, "Attempting to call output on metadata dst\n"); kfree_skb(skb); @@ -375,10 +375,9 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) DST_METADATA | DST_NOCACHE | DST_NOCOUNT); dst->input = dst_md_discard; - dst->output = dst_md_discard_sk; + dst->output = dst_md_discard_out; memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); - md_dst->opts_len = optslen; } struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) @@ -431,7 +430,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ae8306e7c56f..365de66436ac 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -44,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); -u32 fib_default_rule_pref(struct fib_rules_ops *ops) +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -60,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops) return 0; } -EXPORT_SYMBOL(fib_default_rule_pref); static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, @@ -299,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } rule->fr_net = net; - if (tb[FRA_PRIORITY]) - rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) + : fib_default_rule_pref(ops); if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -350,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) else rule->suppress_ifgroup = -1; - if (!tb[FRA_PRIORITY] && ops->default_pref) - rule->pref = ops->default_pref(ops); - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) @@ -635,15 +631,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, { int idx = 0; struct fib_rule *rule; + int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWRULE, - NLM_F_MULTI, ops) < 0) + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) break; skip: idx++; @@ -652,7 +650,7 @@ skip: cb->args[1] = idx; rules_ops_put(ops); - return skb->len; + return err; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) @@ -668,7 +666,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - return dump_rules(skb, cb, ops); + dump_rules(skb, cb, ops); + + return skb->len; } rcu_read_lock(); diff --git a/net/core/filter.c b/net/core/filter.c index 66500d490995..0b00094932ab 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -49,16 +49,17 @@ #include <net/sch_generic.h> #include <net/cls_cgroup.h> #include <net/dst_metadata.h> +#include <net/dst.h> /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * - * Run the filter code and then cut skb->data to correct size returned by - * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller + * Run the eBPF program and then cut skb->data to correct size returned by + * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to SK_RUN_FILTER. It returns 0 if the packet should + * wrapper to BPF_PROG_RUN. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -82,7 +83,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } @@ -148,12 +149,6 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return raw_smp_processor_id(); } -/* note that this only generates 32-bit random numbers */ -static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) -{ - return prandom_u32(); -} - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -312,7 +307,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, *insn = BPF_EMIT_CALL(__get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: - *insn = BPF_EMIT_CALL(__get_random_u32); + *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); + bpf_user_rnd_init_once(); break; } break; @@ -478,9 +474,9 @@ do_pass: bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; - insn->src_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ @@ -1001,7 +997,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, int err; fp->bpf_func = NULL; - fp->jited = false; + fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { @@ -1083,16 +1079,18 @@ EXPORT_SYMBOL_GPL(bpf_prog_create); * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler + * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, - bpf_aux_classic_check_t trans) + bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; + int err; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) @@ -1108,12 +1106,16 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, } fp->len = fprog->len; - /* Since unattached filters are not copied back to user - * space through sk_get_filter(), we do not need to hold - * a copy here, and can spare us the work. - */ fp->orig_prog = NULL; + if (save_orig) { + err = bpf_prog_store_orig_filter(fp, fprog); + if (err) { + __bpf_prog_free(fp); + return -ENOMEM; + } + } + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ @@ -1404,9 +1406,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!dev)) return -EINVAL; - if (unlikely(!(dev->flags & IFF_UP))) - return -EINVAL; - skb2 = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb2)) return -ENOMEM; @@ -1427,6 +1426,49 @@ const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; +struct redirect_info { + u32 ifindex; + u32 flags; +}; + +static DEFINE_PER_CPU(struct redirect_info, redirect_info); +static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + ri->ifindex = ifindex; + ri->flags = flags; + return TC_ACT_REDIRECT; +} + +int skb_do_redirect(struct sk_buff *skb) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct net_device *dev; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + kfree_skb(skb); + return -EINVAL; + } + + if (BPF_IS_REDIRECT_INGRESS(ri->flags)) + return dev_forward_skb(dev, skb); + + skb->dev = dev; + skb_sender_cpu_clear(skb); + return dev_queue_xmit(skb); +} + +const struct bpf_func_proto bpf_redirect_proto = { + .func = bpf_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return task_get_classid((struct sk_buff *) (unsigned long) r1); @@ -1439,6 +1481,25 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + const struct dst_entry *dst; + + dst = skb_dst((struct sk_buff *) (unsigned long) r1); + if (dst) + return dst->tclassid; +#endif + return 0; +} + +static const struct bpf_func_proto bpf_get_route_realm_proto = { + .func = bpf_get_route_realm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; @@ -1493,6 +1554,8 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) return -EINVAL; + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; to->tunnel_id = be64_to_cpu(info->key.tun_id); to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); @@ -1577,7 +1640,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); default: return NULL; } @@ -1605,6 +1669,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_key_proto(); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; default: return sk_filter_func_proto(func_id); } @@ -1630,6 +1698,9 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { + if (off == offsetof(struct __sk_buff, tc_classid)) + return false; + if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... @@ -1646,10 +1717,14 @@ static bool sk_filter_is_valid_access(int off, int size, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type) { + if (off == offsetof(struct __sk_buff, tc_classid)) + return type == BPF_WRITE ? true : false; + if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, tc_index): + case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... offsetof(struct __sk_buff, cb[4]): break; @@ -1662,7 +1737,8 @@ static bool tc_cls_act_is_valid_access(int off, int size, static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, - struct bpf_insn *insn_buf) + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1691,8 +1767,12 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, priority): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, priority)); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); break; case offsetof(struct __sk_buff, ingress_ifindex): @@ -1749,6 +1829,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + prog->cb_access = 1; ctx_off -= offsetof(struct __sk_buff, cb[0]); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct qdisc_skb_cb, data); @@ -1758,6 +1839,14 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); break; + case offsetof(struct __sk_buff, tc_classid): + ctx_off -= offsetof(struct __sk_buff, tc_classid); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + WARN_ON(type != BPF_WRITE); + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + break; + case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 11e6540fa386..d79699c9d1b9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,14 +19,14 @@ #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> -static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static bool dissector_uses_key(const struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1 << key_id); } -static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } @@ -51,20 +51,20 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); - BUG_ON(skb_flow_dissector_uses_key(flow_dissector, - key->key_id)); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); - skb_flow_dissector_set_key(flow_dissector, key->key_id); + dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_CONTROL)); - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_BASIC)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); @@ -121,7 +121,8 @@ EXPORT_SYMBOL(__skb_flow_get_ports); bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, - void *data, __be16 proto, int nhoff, int hlen) + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -130,6 +131,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_keyid *key_keyid; u8 ip_proto = 0; + bool ret = false; if (!data) { data = skb->data; @@ -152,8 +154,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; @@ -171,15 +173,13 @@ again: ip: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) - return false; + goto out_bad; nhoff += iph->ihl * 4; ip_proto = iph->protocol; - if (ip_is_fragment(iph)) - ip_proto = 0; - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; key_addrs = skb_flow_dissector_target(flow_dissector, @@ -187,6 +187,22 @@ ip: memcpy(&key_addrs->v4addrs, &iph->saddr, sizeof(key_addrs->v4addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + + if (ip_is_fragment(iph)) { + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + if (iph->frag_off & htons(IP_OFFSET)) { + goto out_good; + } else { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) + goto out_good; + } + } + + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_IPV6): { @@ -197,13 +213,13 @@ ip: ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) - return false; + goto out_bad; ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, @@ -216,15 +232,20 @@ ipv6: flow_label = ip6_flowlabel(iph); if (flow_label) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) + goto out_good; } + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_8021AD): @@ -234,10 +255,10 @@ ipv6: vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) - return false; + goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_VLANID, target_container); @@ -256,7 +277,7 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; proto = hdr->proto; nhoff += PPPOE_SES_HLEN; switch (proto) { @@ -265,7 +286,7 @@ ipv6: case htons(PPP_IPV6): goto ipv6; default: - return false; + goto out_bad; } } case htons(ETH_P_TIPC): { @@ -275,19 +296,17 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; - key_basic->n_proto = proto; - key_control->thoff = (u16)nhoff; + goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC_ADDRS, target_container); key_addrs->tipcaddrs.srcnode = hdr->srcnode; key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; } - return true; + goto out_good; } case htons(ETH_P_MPLS_UC): @@ -297,12 +316,12 @@ mpls: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); @@ -310,21 +329,17 @@ mpls: htonl(MPLS_LS_LABEL_MASK); } - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - - return true; + goto out_good; } - return true; + goto out_good; } case htons(ETH_P_FCOE): key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: - return false; + goto out_bad; } ip_proto_again: @@ -337,7 +352,7 @@ ip_proto_again: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; /* * Only look inside GRE if version zero and no * routing @@ -357,10 +372,10 @@ ip_proto_again: data, hlen, &_keyid); if (!keyid) - return false; + goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); @@ -378,10 +393,15 @@ ip_proto_again: sizeof(_eth), data, hlen, &_eth); if (!eth) - return false; + goto out_bad; proto = eth->h_proto; nhoff += sizeof(*eth); } + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto again; } case NEXTHDR_HOP: @@ -395,18 +415,53 @@ ip_proto_again: opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) - return false; + goto out_bad; ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; goto ip_proto_again; } + case NEXTHDR_FRAGMENT: { + struct frag_hdr _fh, *fh; + + if (proto != htons(ETH_P_IPV6)) + break; + + fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), + data, hlen, &_fh); + + if (!fh) + goto out_bad; + + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + nhoff += sizeof(_fh); + + if (!(fh->frag_off & htons(IP6_OFFSET))) { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + ip_proto = fh->nexthdr; + goto ip_proto_again; + } + } + goto out_good; + } case IPPROTO_IPIP: proto = htons(ETH_P_IP); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ip; case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ipv6; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); @@ -415,12 +470,8 @@ ip_proto_again: break; } - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); @@ -428,7 +479,15 @@ ip_proto_again: data, hlen); } - return true; +out_good: + ret = true; + +out_bad: + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; + + return ret; } EXPORT_SYMBOL(__skb_flow_dissect); @@ -438,18 +497,21 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) +static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, + u32 keyval) { return jhash2(words, length, keyval); } -static inline void *flow_keys_hash_start(struct flow_keys *flow) +static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) { + const void *p = flow; + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); - return (void *)flow + FLOW_KEYS_HASH_OFFSET; + return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); } -static inline size_t flow_keys_hash_length(struct flow_keys *flow) +static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); @@ -539,7 +601,7 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) __flow_hash_consistentify(keys); - hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + hash = __flow_hash_words(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -557,8 +619,8 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect_flow_keys(skb, keys)) - return 0; + skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } @@ -590,15 +652,6 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, } EXPORT_SYMBOL(make_flow_keys_digest); -static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash, - struct flow_keys *keys) -{ - if (keys->ports.ports) - skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; -} - /** * __skb_get_hash: calculate a flow hash * @skb: sk_buff to calculate flow hash from @@ -611,15 +664,11 @@ static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash, void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, hashrnd); - if (!hash) - return; - - __skb_set_sw_hash(skb, hash, &keys); + __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), + flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); @@ -631,7 +680,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { struct flow_keys keys; @@ -648,13 +697,14 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) keys.tags.flow_label = (__force u32)fl6->flowlabel; keys.basic.ip_proto = fl6->flowi6_proto; - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); return skb->hash; } EXPORT_SYMBOL(__skb_get_hash_flowi6); -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) { struct flow_keys keys; @@ -668,7 +718,8 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) keys.keyid.keyid = fl4->fl4_gre_key; keys.basic.ip_proto = fl4->flowi4_proto; - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); return skb->hash; } @@ -733,12 +784,47 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect_flow_keys(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); + +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + keys->addrs.v4addrs.src = fl4->saddr; + keys->addrs.v4addrs.dst = fl4->daddr; + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys->ports.src = fl4->fl4_sport; + keys->ports.dst = fl4->fl4_dport; + keys->keyid.keyid = fl4->fl4_gre_key; + keys->basic.ip_proto = fl4->flowi4_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi4); + static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index dfb1a9ca0835..299cfc24d888 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -180,7 +180,7 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) } EXPORT_SYMBOL(lwtunnel_cmp_encap); -int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; @@ -199,7 +199,7 @@ int lwtunnel_output(struct sock *sk, struct sk_buff *skb) rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) - ret = ops->output(sk, skb); + ret = ops->output(net, sk, skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2b515ba7e94f..1aa8437ed6c4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2235,14 +2235,53 @@ static void neigh_update_notify(struct neighbour *neigh) __neigh_notify(neigh, RTM_NEWNEIGH, 0); } +static bool neigh_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) +{ + if (filter_idx && dev->ifindex != filter_idx) + return true; + + return false; +} + static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + const struct nlmsghdr *nlh = cb->nlh; + struct nlattr *tb[NDA_MAX + 1]; struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; + int filter_master_idx = 0, filter_idx = 0; + unsigned int flags = NLM_F_MULTI; + int err; + + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL); + if (!err) { + if (tb[NDA_IFINDEX]) + filter_idx = nla_get_u32(tb[NDA_IFINDEX]); + + if (tb[NDA_MASTER]) + filter_master_idx = nla_get_u32(tb[NDA_MASTER]); + + if (filter_idx || filter_master_idx) + flags |= NLM_F_DUMP_FILTERED; + } rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); @@ -2255,12 +2294,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, n = rcu_dereference_bh(n->next)) { if (!net_eq(dev_net(n->dev), net)) continue; + if (neigh_ifindex_filtered(n->dev, filter_idx)) + continue; + if (neigh_master_filtered(n->dev, filter_master_idx)) + continue; if (idx < s_idx) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, - NLM_F_MULTI) < 0) { + flags) < 0) { rc = -1; goto out; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b279077c3089..f88a62ab019d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -31,7 +31,6 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; -static const char fmt_udec[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -202,7 +201,7 @@ static ssize_t speed_show(struct device *dev, if (netif_running(netdev)) { struct ethtool_cmd cmd; if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); + ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); } rtnl_unlock(); return ret; @@ -472,7 +471,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -1004,15 +1003,12 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) - if (queue == &dev->_tx[i]) - break; + unsigned int i; + i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; @@ -1481,6 +1477,15 @@ static int of_dev_node_match(struct device *dev, const void *data) return ret == 0 ? dev->of_node == data : ret; } +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index ba3c0120786c..adef015b2f41 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -31,6 +31,7 @@ #include <trace/events/napi.h> #include <trace/events/sock.h> #include <trace/events/udp.h> +#include <trace/events/fib.h> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c126a878c47c..94acfc89ad97 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -140,36 +140,42 @@ static void queue_process(struct work_struct *work) * case. Further, we test the poll_owner to avoid recursion on UP * systems where the lock doesn't exist. */ -static int poll_one_napi(struct napi_struct *napi, int budget) +static void poll_one_napi(struct napi_struct *napi) { - int work; + int work = 0; /* net_rx_action's ->poll() invocations and our's are * synchronized by this test which is only made while * holding the napi->poll_lock. */ if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return budget; + return; - set_bit(NAPI_STATE_NPSVC, &napi->state); + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + return; - work = napi->poll(napi, budget); - WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); + /* We explicilty pass the polling call a budget of 0 to + * indicate that we are clearing the Tx path only. + */ + work = napi->poll(napi, 0); + WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); - - return budget - work; } -static void poll_napi(struct net_device *dev, int budget) +static void poll_napi(struct net_device *dev) { struct napi_struct *napi; list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(napi, budget); + poll_one_napi(napi); spin_unlock(&napi->poll_lock); } } @@ -179,7 +185,6 @@ static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); - int budget = 0; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity @@ -202,7 +207,7 @@ static void netpoll_poll_dev(struct net_device *dev) /* Process pending work on NIC */ ops->ndo_poll_controller(dev); - poll_napi(dev, budget); + poll_napi(dev); up(&ni->dev_lock); @@ -380,6 +385,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) static atomic_t ip_ident; struct ipv6hdr *ip6h; + WARN_ON_ONCE(!irqs_disabled()); + udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index b42f0e26f89e..5d26056b6d8f 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -37,90 +37,16 @@ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) +void reqsk_queue_alloc(struct request_sock_queue *queue) { - size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt = NULL; + spin_lock_init(&queue->rskq_lock); - nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); - nr_table_entries = max_t(u32, nr_table_entries, 8); - nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt_size += nr_table_entries * sizeof(struct request_sock *); + spin_lock_init(&queue->fastopenq.lock); + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; - if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - lopt = kzalloc(lopt_size, GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); - if (!lopt) - lopt = vzalloc(lopt_size); - if (!lopt) - return -ENOMEM; - - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); - spin_lock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; - lopt->nr_table_entries = nr_table_entries; - lopt->max_qlen_log = ilog2(nr_table_entries); - - spin_lock_bh(&queue->syn_wait_lock); - queue->listen_opt = lopt; - spin_unlock_bh(&queue->syn_wait_lock); - - return 0; -} - -void __reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* This is an error recovery path only, no locking needed */ - kvfree(queue->listen_opt); -} - -static inline struct listen_sock *reqsk_queue_yank_listen_sk( - struct request_sock_queue *queue) -{ - struct listen_sock *lopt; - - spin_lock_bh(&queue->syn_wait_lock); - lopt = queue->listen_opt; - queue->listen_opt = NULL; - spin_unlock_bh(&queue->syn_wait_lock); - - return lopt; -} - -void reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* make all the listen_opt local to us */ - struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - - if (listen_sock_qlen(lopt) != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - spin_lock_bh(&queue->syn_wait_lock); - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - /* Because of following del_timer_sync(), - * we must release the spinlock here - * or risk a dead lock. - */ - spin_unlock_bh(&queue->syn_wait_lock); - atomic_inc(&lopt->qlen_dec); - if (del_timer_sync(&req->rsk_timer)) - reqsk_put(req); - reqsk_put(req); - spin_lock_bh(&queue->syn_wait_lock); - } - spin_unlock_bh(&queue->syn_wait_lock); - } - } - - if (WARN_ON(listen_sock_qlen(lopt) != 0)) - pr_err("qlen %u\n", listen_sock_qlen(lopt)); - kvfree(lopt); } /* @@ -174,7 +100,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; - fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 788ceed39463..24775953fa68 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -96,7 +96,7 @@ int rtnl_is_locked(void) EXPORT_SYMBOL(rtnl_is_locked); #ifdef CONFIG_PROVE_LOCKING -int lockdep_rtnl_is_held(void) +bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } @@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; + } else if (i == RTAX_FEATURES - 1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; @@ -1019,7 +1025,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -1266,7 +1272,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (!(af = nla_nest_start(skb, af_ops->family))) goto nla_put_failure; - err = af_ops->fill_link_af(skb, dev); + err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there @@ -3041,6 +3047,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; u32 filter_mask = 0; + int err; if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { struct nlattr *extfilt; @@ -3061,20 +3068,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } if (ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8a725cc50a90..fab4599ba8b2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) if (skb && frag_size) { skb->head_frag = 1; - if (virt_to_head_page(data)->pfmemalloc) + if (page_is_pfmemalloc(virt_to_head_page(data))) skb->pfmemalloc = 1; } return skb; @@ -2958,11 +2958,12 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags); */ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { + unsigned char *data = skb->data; + BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); diff --git a/net/core/sock.c b/net/core/sock.c index 193901d09757..dcc7d62654d5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -988,6 +988,10 @@ set_rcvbuf: sk->sk_max_pacing_rate); break; + case SO_INCOMING_CPU: + sk->sk_incoming_cpu = val; + break; + default: ret = -ENOPROTOOPT; break; @@ -1852,6 +1856,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) +{ + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL(sock_cmsg_send); + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) @@ -2078,7 +2108,7 @@ suppress_allocation: EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_reclaim - reclaim memory_allocated + * __sk_mem_reclaim - reclaim memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ @@ -2353,6 +2383,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -2740,10 +2771,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; - if (rsk_prot->slab) { - kmem_cache_destroy(rsk_prot->slab); - rsk_prot->slab = NULL; - } + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) @@ -2760,7 +2789,7 @@ static int req_prot_init(const struct proto *prot) rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, - 0, NULL); + prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", @@ -2828,10 +2857,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index d79866c5f8bc..0c1d58d43f67 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -1,3 +1,5 @@ +/* License: GPL */ + #include <linux/mutex.h> #include <linux/socket.h> #include <linux/skbuff.h> @@ -90,6 +92,9 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, goto out; fprog = filter->prog->orig_prog; + if (!fprog) + goto out; + flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); @@ -320,14 +325,4 @@ static int __init sock_diag_init(void) BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } - -static void __exit sock_diag_exit(void) -{ - unregister_pernet_subsys(&diag_net_ops); - destroy_workqueue(broadcast_wq); -} - -module_init(sock_diag_init); -module_exit(sock_diag_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); +device_initcall(sock_diag_init); diff --git a/net/core/utils.c b/net/core/utils.c index 3dffce953c39..3d17ca8b4744 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -348,52 +348,3 @@ void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, } } EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); - -struct __net_random_once_work { - struct work_struct work; - struct static_key *key; -}; - -static void __net_random_once_deferred(struct work_struct *w) -{ - struct __net_random_once_work *work = - container_of(w, struct __net_random_once_work, work); - BUG_ON(!static_key_enabled(work->key)); - static_key_slow_dec(work->key); - kfree(work); -} - -static void __net_random_once_disable_jump(struct static_key *key) -{ - struct __net_random_once_work *w; - - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return; - - INIT_WORK(&w->work, __net_random_once_deferred); - w->key = key; - schedule_work(&w->work); -} - -bool __net_get_random_once(void *buf, int nbytes, bool *done, - struct static_key *once_key) -{ - static DEFINE_SPINLOCK(lock); - unsigned long flags; - - spin_lock_irqsave(&lock, flags); - if (*done) { - spin_unlock_irqrestore(&lock, flags); - return false; - } - - get_random_bytes(buf, nbytes); - *done = true; - spin_unlock_irqrestore(&lock, flags); - - __net_random_once_disable_jump(once_key); - - return true; -} -EXPORT_SYMBOL(__net_get_random_once); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 5b21f6f88e97..4f6c1862dfd2 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -13,6 +13,7 @@ * You should have received a copy of the GNU General Public License along with * this program; if not, see <http://www.gnu.org/licenses/>. * + * Description: Data Center Bridging netlink interface * Author: Lucy Liu <lucy.liu@intel.com> */ @@ -24,7 +25,7 @@ #include <linux/dcbnl.h> #include <net/dcbevent.h> #include <linux/rtnetlink.h> -#include <linux/module.h> +#include <linux/init.h> #include <net/sock.h> /* Data Center Bridging (DCB) is a collection of Ethernet enhancements @@ -48,10 +49,6 @@ * features for capable devices. */ -MODULE_AUTHOR("Lucy Liu, <lucy.liu@intel.com>"); -MODULE_DESCRIPTION("Data Center Bridging netlink interface"); -MODULE_LICENSE("GPL"); - /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ @@ -1935,19 +1932,6 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); -static void dcb_flushapp(void) -{ - struct dcb_app_type *app; - struct dcb_app_type *tmp; - - spin_lock_bh(&dcb_lock); - list_for_each_entry_safe(app, tmp, &dcb_app_list, list) { - list_del(&app->list); - kfree(app); - } - spin_unlock_bh(&dcb_lock); -} - static int __init dcbnl_init(void) { INIT_LIST_HEAD(&dcb_app_list); @@ -1957,12 +1941,4 @@ static int __init dcbnl_init(void) return 0; } -module_init(dcbnl_init); - -static void __exit dcbnl_exit(void) -{ - rtnl_unregister(PF_UNSPEC, RTM_GETDCB); - rtnl_unregister(PF_UNSPEC, RTM_SETDCB); - dcb_flushapp(); -} -module_exit(dcbnl_exit); +device_initcall(dcbnl_init); diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index bd9e718c2a20..3de0d0362d7f 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -398,12 +398,8 @@ out_err: void dccp_ackvec_exit(void) { - if (dccp_ackvec_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - } - if (dccp_ackvec_record_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; - } + kmem_cache_destroy(dccp_ackvec_slab); + dccp_ackvec_slab = NULL; + kmem_cache_destroy(dccp_ackvec_record_slab); + dccp_ackvec_record_slab = NULL; } diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 83498975165f..90f77d08cc37 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -95,8 +95,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_f static void ccid_kmem_cache_destroy(struct kmem_cache *slab) { - if (slab != NULL) - kmem_cache_destroy(slab); + kmem_cache_destroy(slab); } static int __init ccid_activate(struct ccid_operations *ccid_ops) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index bebc735f5afc..923f5a180134 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -229,7 +229,7 @@ void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); int dccp_retransmit_skb(struct sock *sk); void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk); void dccp_send_sync(struct sock *sk, const u64 seq, @@ -270,13 +270,13 @@ int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); -struct sock *dccp_create_openreq_child(struct sock *sk, +struct sock *dccp_create_openreq_child(const struct sock *sk, const struct request_sock *req, const struct sk_buff *skb); int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); -struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, @@ -293,7 +293,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); void dccp_destroy_sock(struct sock *sk); void dccp_close(struct sock *sk, long timeout); -struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, +struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, struct request_sock *req); int dccp_connect(struct sock *sk); @@ -325,13 +325,13 @@ void dccp_send_close(struct sock *sk, const int active); int dccp_invalid_packet(struct sk_buff *skb); u32 dccp_sample_rtt(struct sock *sk, long delta); -static inline int dccp_bad_service_code(const struct sock *sk, +static inline bool dccp_bad_service_code(const struct sock *sk, const __be32 service) { const struct dccp_sock *dp = dccp_sk(sk); if (dp->dccps_service == service) - return 0; + return false; return !dccp_list_has_service(dp->dccps_service_list, service); } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ccf4c5629b3c..8e99681c8189 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -390,7 +390,8 @@ static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) * * This is the equivalent of TCP's tcp_v4_syn_recv_sock */ -struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *dccp_v4_request_recv_sock(const struct sock *sk, + struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { @@ -443,36 +444,6 @@ put_and_exit: } EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); -static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct sock *nsk; - /* Find possible connection requests. */ - struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport, - iph->saddr, iph->daddr); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, - iph->saddr, dh->dccph_sport, - iph->daddr, dh->dccph_dport, - inet_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -498,7 +469,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, return &rt->dst; } -static int dccp_v4_send_response(struct sock *sk, struct request_sock *req) +static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) { int err = -1; struct sk_buff *skb; @@ -527,7 +498,7 @@ out: return err; } -static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) { int err; const struct iphdr *rxiph; @@ -624,7 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp_request_sock_ops, sk); + req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); if (req == NULL) goto drop; @@ -704,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v4_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dh, skb->len)) goto reset; @@ -723,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: dccp_v4_ctl_send_reset(sk, skb); -discard: kfree_skb(skb); return 0; } @@ -867,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v4_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5165571f397a..aed314f8c7c6 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -181,7 +181,7 @@ out: } -static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) +static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -234,7 +234,7 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req) kfree_skb(inet_rsk(req)->pktopts); } -static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) { const struct ipv6hdr *rxip6h; struct sk_buff *skb; @@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = { .syn_ack_timeout = dccp_syn_ack_timeout, }; -static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr, - &iph->daddr, inet6_iif(skb)); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, - &iph->saddr, dh->dccph_sport, - &iph->daddr, ntohs(dh->dccph_dport), - inet6_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct request_sock *req; @@ -350,7 +319,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk); + req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); if (req == NULL) goto drop; @@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: @@ -408,13 +377,14 @@ drop: return -1; } -static struct sock *dccp_v6_request_recv_sock(struct sock *sk, +static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_pinfo *newnp; + const struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; @@ -462,22 +432,11 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, if (sk_acceptq_is_full(sk)) goto out_overflow; - if (dst == NULL) { - struct in6_addr *final_p, final; + if (!dst) { struct flowi6 fl6; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); - fl6.saddr = ireq->ir_v6_loc_addr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.fl6_dport = ireq->ir_rmt_port; - fl6.fl6_sport = htons(ireq->ir_num); - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); - if (IS_ERR(dst)) + dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP); + if (!dst) goto out; } @@ -651,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v6_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - if (opt_skb != NULL) - __kfree_skb(opt_skb); - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) goto reset; @@ -742,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v6_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 30addee2dd03..d10aace43672 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = sk->sk_ipv6only; } #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) timeo = DCCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -72,7 +72,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) dccp_done(sk); } -struct sock *dccp_create_openreq_child(struct sock *sk, +struct sock *dccp_create_openreq_child(const struct sock *sk, const struct request_sock *req, const struct sk_buff *skb) { @@ -236,7 +236,7 @@ int dccp_child_process(struct sock *parent, struct sock *child, EXPORT_SYMBOL_GPL(dccp_child_process); -void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk) { DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); diff --git a/net/dccp/output.c b/net/dccp/output.c index 0248e8a3460c..4ce912e691d0 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -390,7 +390,7 @@ int dccp_retransmit_skb(struct sock *sk) return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); } -struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, +struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, struct request_sock *req) { struct dccp_hdr *dh; @@ -398,13 +398,18 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, const u32 dccp_header_size = sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext) + sizeof(struct dccp_hdr_response); - struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, - GFP_ATOMIC); - if (skb == NULL) + struct sk_buff *skb; + + /* sk is marked const to clearly express we dont hold socket lock. + * sock_wmalloc() will atomically change sk->sk_wmem_alloc, + * it is safe to promote sk to non const. + */ + skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, + GFP_ATOMIC); + if (!skb) return NULL; - /* Reserve space for headers. */ - skb_reserve(skb, sk->sk_prot->max_header); + skb_reserve(skb, MAX_DCCP_HEADER); skb_dst_set(skb, dst_clone(dst)); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 4507b188fc51..482730cd8a56 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -194,7 +194,7 @@ static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb) return err; } -static int dn_neigh_output_packet(struct sock *sk, struct sk_buff *skb) +static int dn_neigh_output_packet(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -246,8 +246,9 @@ static int dn_long_output(struct neighbour *neigh, struct sock *sk, skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, - NULL, neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, + &init_net, sk, skb, NULL, neigh->dev, + dn_neigh_output_packet); } /* @@ -286,8 +287,9 @@ static int dn_short_output(struct neighbour *neigh, struct sock *sk, skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, - NULL, neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, + &init_net, sk, skb, NULL, neigh->dev, + dn_neigh_output_packet); } /* @@ -327,11 +329,12 @@ static int dn_phase3_output(struct neighbour *neigh, struct sock *sk, skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, - NULL, neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, + &init_net, sk, skb, NULL, neigh->dev, + dn_neigh_output_packet); } -int dn_to_neigh_output(struct sock *sk, struct sk_buff *skb) +int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *) dst; @@ -375,7 +378,7 @@ void dn_neigh_pointopoint_hello(struct sk_buff *skb) /* * Ethernet router hello message received */ -int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb) +int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data; @@ -437,7 +440,7 @@ int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb) /* * Endnode hello message received */ -int dn_neigh_endnode_hello(struct sock *sk, struct sk_buff *skb) +int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb) { struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data; struct neighbour *neigh; diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index a321eac9fd0c..7ac086d5c0c0 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -714,7 +714,8 @@ out: return ret; } -static int dn_nsp_rx_packet(struct sock *sk2, struct sk_buff *skb) +static int dn_nsp_rx_packet(struct net *net, struct sock *sk2, + struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); struct sock *sk = NULL; @@ -814,8 +815,8 @@ free_out: int dn_nsp_rx(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, + &init_net, NULL, skb, skb->dev, NULL, dn_nsp_rx_packet); } diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 1aaa51ebbda6..849805e7af52 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -85,7 +85,7 @@ static void dn_nsp_send(struct sk_buff *skb) if (dst) { try_again: skb_dst_set(skb, dst); - dst_output(skb); + dst_output(&init_net, skb->sk, skb); return; } @@ -582,7 +582,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, * associations. */ skb_dst_set(skb, dst_clone(dst)); - dst_output(skb); + dst_output(&init_net, skb->sk, skb); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 03227ffd19ce..27fce283117b 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -512,7 +512,7 @@ static int dn_return_long(struct sk_buff *skb) * * Returns: result of input function if route is found, error code otherwise */ -static int dn_route_rx_packet(struct sock *sk, struct sk_buff *skb) +static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dn_skb_cb *cb; int err; @@ -573,8 +573,8 @@ static int dn_route_rx_long(struct sk_buff *skb) ptr++; cb->hops = *ptr++; /* Visit Count */ - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, + &init_net, NULL, skb, skb->dev, NULL, dn_route_rx_packet); drop_it: @@ -601,8 +601,8 @@ static int dn_route_rx_short(struct sk_buff *skb) ptr += 2; cb->hops = *ptr & 0x3f; - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, + &init_net, NULL, skb, skb->dev, NULL, dn_route_rx_packet); drop_it: @@ -610,7 +610,7 @@ drop_it: return NET_RX_DROP; } -static int dn_route_discard(struct sock *sk, struct sk_buff *skb) +static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb) { /* * I know we drop the packet here, but thats considered success in @@ -620,7 +620,7 @@ static int dn_route_discard(struct sock *sk, struct sk_buff *skb) return NET_RX_SUCCESS; } -static int dn_route_ptp_hello(struct sock *sk, struct sk_buff *skb) +static int dn_route_ptp_hello(struct net *net, struct sock *sk, struct sk_buff *skb) { dn_dev_hello(skb); dn_neigh_pointopoint_hello(skb); @@ -706,22 +706,22 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type switch (flags & DN_RT_CNTL_MSK) { case DN_RT_PKT_HELO: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_route_ptp_hello); case DN_RT_PKT_L1RT: case DN_RT_PKT_L2RT: return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_route_discard); case DN_RT_PKT_ERTH: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_neigh_router_hello); case DN_RT_PKT_EEDH: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_neigh_endnode_hello); } } else { @@ -744,7 +744,7 @@ out: return NET_RX_DROP; } -static int dn_output(struct sock *sk, struct sk_buff *skb) +static int dn_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -770,8 +770,8 @@ static int dn_output(struct sock *sk, struct sk_buff *skb) cb->rt_flags |= DN_RT_F_IE; cb->hops = 0; - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, sk, skb, - NULL, dev, + return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, + &init_net, sk, skb, NULL, dev, dn_to_neigh_output); error: @@ -819,8 +819,8 @@ static int dn_forward(struct sk_buff *skb) if (rt->rt_flags & RTCF_DOREDIRECT) cb->rt_flags |= DN_RT_F_IE; - return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, NULL, skb, - dev, skb->dev, + return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, + &init_net, NULL, skb, dev, skb->dev, dn_to_neigh_output); drop: @@ -832,7 +832,7 @@ drop: * Used to catch bugs. This should never normally get * called. */ -static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb) +static int dn_rt_bug_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1469,7 +1469,7 @@ make_route: rt->n = neigh; rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug_sk; + rt->dst.output = dn_rt_bug_out; switch (res.type) { case RTN_UNICAST: rt->dst.input = dn_forward; diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index 9d66a0f72f90..295bbd6a56f2 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { .configure = dn_fib_rule_configure, .compare = dn_fib_rule_compare, .fill = dn_fib_rule_fill, - .default_pref = fib_default_rule_pref, .flush_cache = dn_fib_rule_flush_cache, .nlgroup = RTNLGRP_DECnet_RULE, .policy = dn_fib_rule_policy, diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index af34fc9bdf69..85f2fdc360c2 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -87,7 +87,7 @@ static void dnrmg_send_peer(struct sk_buff *skb) } -static unsigned int dnrmg_hook(const struct nf_hook_ops *ops, +static unsigned int dnrmg_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 053eb2b8e682..aa398bcef9e3 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -176,6 +176,41 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); #endif /* CONFIG_NET_DSA_HWMON */ /* basic switch operations **************************************************/ +static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) +{ + struct dsa_chip_data *cd = ds->pd; + struct device_node *port_dn; + struct phy_device *phydev; + int ret, port, mode; + + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + + port_dn = cd->port_dn[port]; + if (of_phy_is_fixed_link(port_dn)) { + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + netdev_err(master, + "failed to register fixed PHY\n"); + return ret; + } + phydev = of_phy_find_device(port_dn); + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + phydev->interface = mode; + + genphy_config_init(phydev); + genphy_read_status(phydev); + if (ds->drv->adjust_link) + ds->drv->adjust_link(ds, port, phydev); + } + } + return 0; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -291,12 +326,20 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) ret = dsa_slave_create(ds, parent, i, pd->port_names[i]); if (ret < 0) { - netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s)\n", - index, i, pd->port_names[i]); + netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", + index, i, pd->port_names[i], ret); ret = 0; } } + /* Perform configuration of the CPU and DSA ports */ + ret = dsa_cpu_dsa_setup(ds, dst->master_netdev); + if (ret < 0) { + netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", + index); + ret = 0; + } + #ifdef CONFIG_NET_DSA_HWMON /* If the switch provides a temperature sensor, * register with hardware monitoring subsystem. @@ -591,6 +634,10 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) port_index++; } kfree(pd->chip[i].rtable); + + /* Drop our reference to the MDIO bus device */ + if (pd->chip[i].host_dev) + put_device(pd->chip[i].host_dev); } kfree(pd->chip); } @@ -618,16 +665,22 @@ static int dsa_of_probe(struct device *dev) return -EPROBE_DEFER; ethernet = of_parse_phandle(np, "dsa,ethernet", 0); - if (!ethernet) - return -EINVAL; + if (!ethernet) { + ret = -EINVAL; + goto out_put_mdio; + } ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) - return -EPROBE_DEFER; + if (!ethernet_dev) { + ret = -EPROBE_DEFER; + goto out_put_mdio; + } pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return -ENOMEM; + if (!pd) { + ret = -ENOMEM; + goto out_put_ethernet; + } dev->platform_data = pd; pd->of_netdev = ethernet_dev; @@ -648,7 +701,9 @@ static int dsa_of_probe(struct device *dev) cd = &pd->chip[chip_index]; cd->of_node = child; - cd->host_dev = &mdio_bus->dev; + + /* When assigning the host device, increment its refcount */ + cd->host_dev = get_device(&mdio_bus->dev); sw_addr = of_get_property(child, "reg", NULL); if (!sw_addr) @@ -668,6 +723,12 @@ static int dsa_of_probe(struct device *dev) ret = -EPROBE_DEFER; goto out_free_chip; } + + /* Drop the mdio_bus device ref, replacing the host + * device with the mdio_bus_switch device, keeping + * the refcount from of_mdio_find_bus() above. + */ + put_device(cd->host_dev); cd->host_dev = &mdio_bus_switch->dev; } @@ -701,6 +762,10 @@ static int dsa_of_probe(struct device *dev) } } + /* The individual chips hold their own refcount on the mdio bus, + * so drop ours */ + put_device(&mdio_bus->dev); + return 0; out_free_chip: @@ -708,6 +773,10 @@ out_free_chip: out_free: kfree(pd); dev->platform_data = NULL; +out_put_ethernet: + put_device(ðernet_dev->dev); +out_put_mdio: + put_device(&mdio_bus->dev); return ret; } @@ -719,6 +788,7 @@ static void dsa_of_remove(struct device *dev) return; dsa_of_free_platform_data(pd); + put_device(&pd->of_netdev->dev); kfree(pd); } #else diff --git a/net/dsa/slave.c b/net/dsa/slave.c index cce97385f743..bb2bd3b56b16 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -242,16 +242,15 @@ static int dsa_bridge_check_vlan_range(struct dsa_switch *ds, } static int dsa_slave_port_vlan_add(struct net_device *dev, - struct switchdev_obj *obj) + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans) { - struct switchdev_obj_vlan *vlan = &obj->u.vlan; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; u16 vid; int err; - switch (obj->trans) { - case SWITCHDEV_TRANS_PREPARE: + if (switchdev_trans_ph_prepare(trans)) { if (!ds->drv->port_vlan_add || !ds->drv->port_pvid_set) return -EOPNOTSUPP; @@ -263,8 +262,7 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, vlan->vid_end); if (err) return err; - break; - case SWITCHDEV_TRANS_COMMIT: + } else { for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { err = ds->drv->port_vlan_add(ds, p->port, vid, vlan->flags & @@ -274,18 +272,14 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, if (err) return err; } - break; - default: - return -EOPNOTSUPP; } return 0; } static int dsa_slave_port_vlan_del(struct net_device *dev, - struct switchdev_obj *obj) + const struct switchdev_obj_port_vlan *vlan) { - struct switchdev_obj_vlan *vlan = &obj->u.vlan; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; u16 vid; @@ -304,9 +298,9 @@ static int dsa_slave_port_vlan_del(struct net_device *dev, } static int dsa_slave_port_vlan_dump(struct net_device *dev, - struct switchdev_obj *obj) + struct switchdev_obj_port_vlan *vlan, + switchdev_obj_dump_cb_t *cb) { - struct switchdev_obj_vlan *vlan = &obj->u.vlan; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; DECLARE_BITMAP(members, DSA_MAX_PORTS); @@ -338,7 +332,7 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, if (test_bit(p->port, untagged)) vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED; - err = obj->cb(dev, obj); + err = cb(&vlan->obj); if (err) break; } @@ -347,37 +341,40 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, } static int dsa_slave_port_fdb_add(struct net_device *dev, - struct switchdev_obj *obj) + const struct switchdev_obj_port_fdb *fdb, + struct switchdev_trans *trans) { - struct switchdev_obj_fdb *fdb = &obj->u.fdb; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; + int ret; - if (obj->trans == SWITCHDEV_TRANS_PREPARE) - ret = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP; - else if (obj->trans == SWITCHDEV_TRANS_COMMIT) - ret = ds->drv->port_fdb_add(ds, p->port, fdb->addr, fdb->vid); + if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) + return -EOPNOTSUPP; + + if (switchdev_trans_ph_prepare(trans)) + ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); + else + ret = ds->drv->port_fdb_add(ds, p->port, fdb, trans); return ret; } static int dsa_slave_port_fdb_del(struct net_device *dev, - struct switchdev_obj *obj) + const struct switchdev_obj_port_fdb *fdb) { - struct switchdev_obj_fdb *fdb = &obj->u.fdb; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; if (ds->drv->port_fdb_del) - ret = ds->drv->port_fdb_del(ds, p->port, fdb->addr, fdb->vid); + ret = ds->drv->port_fdb_del(ds, p->port, fdb); return ret; } static int dsa_slave_port_fdb_dump(struct net_device *dev, - struct switchdev_obj *obj) + struct switchdev_obj_port_fdb *fdb, + switchdev_obj_dump_cb_t *cb) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; @@ -396,11 +393,11 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev, if (ret < 0) break; - obj->u.fdb.addr = addr; - obj->u.fdb.vid = vid; - obj->u.fdb.ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; + fdb->addr = addr; + fdb->vid = vid; + fdb->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; - ret = obj->cb(dev, obj); + ret = cb(&fdb->obj); if (ret < 0) break; } @@ -456,14 +453,20 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) } static int dsa_slave_port_attr_set(struct net_device *dev, - struct switchdev_attr *attr) + struct switchdev_attr *attr, + struct switchdev_trans *trans) { - int ret = 0; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; switch (attr->id) { - case SWITCHDEV_ATTR_PORT_STP_STATE: - if (attr->trans == SWITCHDEV_TRANS_COMMIT) - ret = dsa_slave_stp_update(dev, attr->u.stp_state); + case SWITCHDEV_ATTR_ID_PORT_STP_STATE: + if (switchdev_trans_ph_prepare(trans)) + ret = ds->drv->port_stp_update ? 0 : -EOPNOTSUPP; + else + ret = ds->drv->port_stp_update(ds, p->port, + attr->u.stp_state); break; default: ret = -EOPNOTSUPP; @@ -474,7 +477,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev, } static int dsa_slave_port_obj_add(struct net_device *dev, - struct switchdev_obj *obj) + const struct switchdev_obj *obj, + struct switchdev_trans *trans) { int err; @@ -484,11 +488,15 @@ static int dsa_slave_port_obj_add(struct net_device *dev, */ switch (obj->id) { - case SWITCHDEV_OBJ_PORT_FDB: - err = dsa_slave_port_fdb_add(dev, obj); + case SWITCHDEV_OBJ_ID_PORT_FDB: + err = dsa_slave_port_fdb_add(dev, + SWITCHDEV_OBJ_PORT_FDB(obj), + trans); break; - case SWITCHDEV_OBJ_PORT_VLAN: - err = dsa_slave_port_vlan_add(dev, obj); + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = dsa_slave_port_vlan_add(dev, + SWITCHDEV_OBJ_PORT_VLAN(obj), + trans); break; default: err = -EOPNOTSUPP; @@ -499,16 +507,18 @@ static int dsa_slave_port_obj_add(struct net_device *dev, } static int dsa_slave_port_obj_del(struct net_device *dev, - struct switchdev_obj *obj) + const struct switchdev_obj *obj) { int err; switch (obj->id) { - case SWITCHDEV_OBJ_PORT_FDB: - err = dsa_slave_port_fdb_del(dev, obj); + case SWITCHDEV_OBJ_ID_PORT_FDB: + err = dsa_slave_port_fdb_del(dev, + SWITCHDEV_OBJ_PORT_FDB(obj)); break; - case SWITCHDEV_OBJ_PORT_VLAN: - err = dsa_slave_port_vlan_del(dev, obj); + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = dsa_slave_port_vlan_del(dev, + SWITCHDEV_OBJ_PORT_VLAN(obj)); break; default: err = -EOPNOTSUPP; @@ -519,16 +529,21 @@ static int dsa_slave_port_obj_del(struct net_device *dev, } static int dsa_slave_port_obj_dump(struct net_device *dev, - struct switchdev_obj *obj) + struct switchdev_obj *obj, + switchdev_obj_dump_cb_t *cb) { int err; switch (obj->id) { - case SWITCHDEV_OBJ_PORT_FDB: - err = dsa_slave_port_fdb_dump(dev, obj); + case SWITCHDEV_OBJ_ID_PORT_FDB: + err = dsa_slave_port_fdb_dump(dev, + SWITCHDEV_OBJ_PORT_FDB(obj), + cb); break; - case SWITCHDEV_OBJ_PORT_VLAN: - err = dsa_slave_port_vlan_dump(dev, obj); + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = dsa_slave_port_vlan_dump(dev, + SWITCHDEV_OBJ_PORT_VLAN(obj), + cb); break; default: err = -EOPNOTSUPP; @@ -582,7 +597,7 @@ static int dsa_slave_port_attr_get(struct net_device *dev, struct dsa_switch *ds = p->parent; switch (attr->id) { - case SWITCHDEV_ATTR_PORT_PARENT_ID: + case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: attr->u.ppid.id_len = sizeof(ds->index); memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len); break; @@ -962,6 +977,10 @@ static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_obj_dump = dsa_slave_port_obj_dump, }; +static struct device_type dsa_type = { + .name = "dsa", +}; + static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -1010,8 +1029,10 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, struct dsa_switch *ds = p->parent; p->phy = ds->slave_mii_bus->phy_map[addr]; - if (!p->phy) + if (!p->phy) { + netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; + } /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) @@ -1045,7 +1066,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ ret = of_phy_register_fixed_link(port_dn); if (ret) { - netdev_err(slave_dev, "failed to register fixed PHY\n"); + netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret); return ret; } phy_is_fixed = true; @@ -1056,17 +1077,20 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_flags = ds->drv->get_phy_flags(ds, p->port); if (phy_dn) { - ret = of_mdio_parse_addr(&slave_dev->dev, phy_dn); + int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); + /* If this PHY address is part of phys_mii_mask, which means * that we need to divert reads and writes to/from it, then we * want to bind this device using the slave MII bus created by * DSA to make that happen. */ - if (!phy_is_fixed && ret >= 0 && - (ds->phys_mii_mask & (1 << ret))) { - ret = dsa_slave_phy_connect(p, slave_dev, ret); - if (ret) + if (!phy_is_fixed && phy_id >= 0 && + (ds->phys_mii_mask & (1 << phy_id))) { + ret = dsa_slave_phy_connect(p, slave_dev, phy_id); + if (ret) { + netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); return ret; + } } else { p->phy = of_phy_connect(slave_dev, phy_dn, dsa_slave_adjust_link, @@ -1083,8 +1107,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ if (!p->phy) { ret = dsa_slave_phy_connect(p, slave_dev, p->port); - if (ret) + if (ret) { + netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); return ret; + } } else { netdev_info(slave_dev, "attached PHY at address %d [%s]\n", p->phy->addr, p->phy->drv->name); @@ -1150,6 +1176,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; slave_dev->switchdev_ops = &dsa_slave_switchdev_ops; + SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, NULL); @@ -1195,6 +1222,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { + netdev_err(master, "error %d setting up slave phy\n", ret); free_netdev(slave_dev); return ret; } diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d25efc93d8f1..b6ca0890d018 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -78,7 +78,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, trailer = skb_tail_pointer(skb) - 4; if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || - (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00) + (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) goto out_drop; source_port = trailer[1] & 7; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 217127c3a3ef..9e63f252a89e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -127,12 +127,12 @@ u32 eth_get_headlen(void *data, unsigned int len) struct flow_keys keys; /* this should never happen, but better safe than sorry */ - if (len < sizeof(*eth)) + if (unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len)) + sizeof(*eth), len, 0)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index ea339fa94c27..b4e17a7c0df0 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -7,6 +7,15 @@ #include <net/inet_frag.h> #include <net/6lowpan.h> +typedef unsigned __bitwise__ lowpan_rx_result; +#define RX_CONTINUE ((__force lowpan_rx_result) 0u) +#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u) +#define RX_DROP ((__force lowpan_rx_result) 2u) +#define RX_QUEUED ((__force lowpan_rx_result) 3u) + +#define LOWPAN_DISPATCH_FRAG1 0xc0 +#define LOWPAN_DISPATCH_FRAGN 0xe0 + struct lowpan_create_arg { u16 tag; u16 d_size; @@ -40,7 +49,7 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) /* private device info */ struct lowpan_dev_info { - struct net_device *real_dev; /* real WPAN device ptr */ + struct net_device *wdev; /* wpan device ptr */ u16 fragment_tag; }; @@ -62,4 +71,7 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, const void *_saddr, unsigned int len); netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev); +int lowpan_iphc_decompress(struct sk_buff *skb); +lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb); + #endif /* __IEEE802154_6LOWPAN_I_H__ */ diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 953b1c49f5d1..20c49c724ba0 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -61,7 +61,7 @@ static struct header_ops lowpan_header_ops = { static struct lock_class_key lowpan_tx_busylock; static struct lock_class_key lowpan_netdev_xmit_lock_key; -static void lowpan_set_lockdep_class_one(struct net_device *dev, +static void lowpan_set_lockdep_class_one(struct net_device *ldev, struct netdev_queue *txq, void *_unused) { @@ -69,35 +69,47 @@ static void lowpan_set_lockdep_class_one(struct net_device *dev, &lowpan_netdev_xmit_lock_key); } -static int lowpan_dev_init(struct net_device *dev) +static int lowpan_dev_init(struct net_device *ldev) { - netdev_for_each_tx_queue(dev, lowpan_set_lockdep_class_one, NULL); - dev->qdisc_tx_busylock = &lowpan_tx_busylock; + netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL); + ldev->qdisc_tx_busylock = &lowpan_tx_busylock; + return 0; +} + +static int lowpan_open(struct net_device *dev) +{ + if (!open_count) + lowpan_rx_init(); + open_count++; + return 0; +} + +static int lowpan_stop(struct net_device *dev) +{ + open_count--; + if (!open_count) + lowpan_rx_exit(); return 0; } static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, + .ndo_open = lowpan_open, + .ndo_stop = lowpan_stop, }; -static void lowpan_setup(struct net_device *dev) +static void lowpan_setup(struct net_device *ldev) { - dev->addr_len = IEEE802154_ADDR_LEN; - memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN); - dev->type = ARPHRD_6LOWPAN; - /* Frame Control + Sequence Number + Address fields + Security Header */ - dev->hard_header_len = 2 + 1 + 20 + 14; - dev->needed_tailroom = 2; /* FCS */ - dev->mtu = IPV6_MIN_MTU; - dev->priv_flags |= IFF_NO_QUEUE; - dev->flags = IFF_BROADCAST | IFF_MULTICAST; - dev->watchdog_timeo = 0; - - dev->netdev_ops = &lowpan_netdev_ops; - dev->header_ops = &lowpan_header_ops; - dev->destructor = free_netdev; - dev->features |= NETIF_F_NETNS_LOCAL; + memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); + /* We need an ipv6hdr as minimum len when calling xmit */ + ldev->hard_header_len = sizeof(struct ipv6hdr); + ldev->flags = IFF_BROADCAST | IFF_MULTICAST; + + ldev->netdev_ops = &lowpan_netdev_ops; + ldev->header_ops = &lowpan_header_ops; + ldev->destructor = free_netdev; + ldev->features |= NETIF_F_NETNS_LOCAL; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -109,10 +121,10 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } -static int lowpan_newlink(struct net *src_net, struct net_device *dev, +static int lowpan_newlink(struct net *src_net, struct net_device *ldev, struct nlattr *tb[], struct nlattr *data[]) { - struct net_device *real_dev; + struct net_device *wdev; int ret; ASSERT_RTNL(); @@ -120,58 +132,56 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, pr_debug("adding new link\n"); if (!tb[IFLA_LINK] || - !net_eq(dev_net(dev), &init_net)) + !net_eq(dev_net(ldev), &init_net)) return -EINVAL; - /* find and hold real wpan device */ - real_dev = dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK])); - if (!real_dev) + /* find and hold wpan device */ + wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); + if (!wdev) return -ENODEV; - if (real_dev->type != ARPHRD_IEEE802154) { - dev_put(real_dev); + if (wdev->type != ARPHRD_IEEE802154) { + dev_put(wdev); return -EINVAL; } - if (real_dev->ieee802154_ptr->lowpan_dev) { - dev_put(real_dev); + if (wdev->ieee802154_ptr->lowpan_dev) { + dev_put(wdev); return -EBUSY; } - lowpan_dev_info(dev)->real_dev = real_dev; + lowpan_dev_info(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ - memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN); - - lowpan_netdev_setup(dev, LOWPAN_LLTYPE_IEEE802154); - - ret = register_netdevice(dev); + memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN); + /* We need headroom for possible wpan_dev_hard_header call and tailroom + * for encryption/fcs handling. The lowpan interface will replace + * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN + * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 + * header. + */ + ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + + wdev->needed_headroom; + ldev->needed_tailroom = wdev->needed_tailroom; + + lowpan_netdev_setup(ldev, LOWPAN_LLTYPE_IEEE802154); + + ret = register_netdevice(ldev); if (ret < 0) { - dev_put(real_dev); + dev_put(wdev); return ret; } - real_dev->ieee802154_ptr->lowpan_dev = dev; - if (!open_count) - lowpan_rx_init(); - - open_count++; - + wdev->ieee802154_ptr->lowpan_dev = ldev; return 0; } -static void lowpan_dellink(struct net_device *dev, struct list_head *head) +static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { - struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev); - struct net_device *real_dev = lowpan_dev->real_dev; + struct net_device *wdev = lowpan_dev_info(ldev)->wdev; ASSERT_RTNL(); - open_count--; - - if (!open_count) - lowpan_rx_exit(); - - real_dev->ieee802154_ptr->lowpan_dev = NULL; - unregister_netdevice(dev); - dev_put(real_dev); + wdev->ieee802154_ptr->lowpan_dev = NULL; + unregister_netdevice(ldev); + dev_put(wdev); } static struct rtnl_link_ops lowpan_link_ops __read_mostly = { @@ -196,9 +206,9 @@ static inline void lowpan_netlink_fini(void) static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net_device *wdev = netdev_notifier_info_to_dev(ptr); - if (dev->type != ARPHRD_IEEE802154) + if (wdev->type != ARPHRD_IEEE802154) goto out; switch (event) { @@ -207,8 +217,8 @@ static int lowpan_device_event(struct notifier_block *unused, * also delete possible lowpan interfaces which belongs * to the wpan interface. */ - if (dev->ieee802154_ptr && dev->ieee802154_ptr->lowpan_dev) - lowpan_dellink(dev->ieee802154_ptr->lowpan_dev, NULL); + if (wdev->ieee802154_ptr->lowpan_dev) + lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL); break; default: break; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 214d44aef35b..12e8cf4bda9f 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -32,21 +32,10 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags"; -struct lowpan_frag_info { - u16 d_tag; - u16 d_size; - u8 d_offset; -}; - -static struct lowpan_frag_info *lowpan_cb(struct sk_buff *skb) -{ - return (struct lowpan_frag_info *)skb->cb; -} - static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, - struct sk_buff *prev, struct net_device *dev); + struct sk_buff *prev, struct net_device *ldev); static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, const struct ieee802154_addr *saddr, @@ -111,7 +100,7 @@ out: } static inline struct lowpan_frag_queue * -fq_find(struct net *net, const struct lowpan_frag_info *frag_info, +fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { @@ -121,12 +110,12 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info, struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); - arg.tag = frag_info->d_tag; - arg.d_size = frag_info->d_size; + arg.tag = cb->d_tag; + arg.d_size = cb->d_size; arg.src = src; arg.dst = dst; - hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst); + hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); q = inet_frag_find(&ieee802154_lowpan->frags, &lowpan_frags, &arg, hash); @@ -138,17 +127,17 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info, } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, - struct sk_buff *skb, const u8 frag_type) + struct sk_buff *skb, u8 frag_type) { struct sk_buff *prev, *next; - struct net_device *dev; + struct net_device *ldev; int end, offset; if (fq->q.flags & INET_FRAG_COMPLETE) goto err; - offset = lowpan_cb(skb)->d_offset << 3; - end = lowpan_cb(skb)->d_size; + offset = lowpan_802154_cb(skb)->d_offset << 3; + end = lowpan_802154_cb(skb)->d_size; /* Is this the final fragment? */ if (offset + skb->len == end) { @@ -174,13 +163,16 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || lowpan_cb(prev)->d_offset < lowpan_cb(skb)->d_offset) { + if (!prev || + lowpan_802154_cb(prev)->d_offset < + lowpan_802154_cb(skb)->d_offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (lowpan_cb(next)->d_offset >= lowpan_cb(skb)->d_offset) + if (lowpan_802154_cb(next)->d_offset >= + lowpan_802154_cb(skb)->d_offset) break; /* bingo! */ prev = next; } @@ -195,18 +187,15 @@ found: else fq->q.fragments = skb; - dev = skb->dev; - if (dev) + ldev = skb->dev; + if (ldev) skb->dev = NULL; fq->q.stamp = skb->tstamp; - if (frag_type == LOWPAN_DISPATCH_FRAG1) { - /* Calculate uncomp. 6lowpan header to estimate full size */ - fq->q.meat += lowpan_uncompress_size(skb, NULL); + if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; - } else { - fq->q.meat += skb->len; - } + + fq->q.meat += skb->len; add_frag_mem_limit(fq->q.net, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && @@ -215,7 +204,7 @@ found: unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - res = lowpan_frag_reasm(fq, prev, dev); + res = lowpan_frag_reasm(fq, prev, ldev); skb->_skb_refdst = orefdst; return res; } @@ -235,7 +224,7 @@ err: * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, - struct net_device *dev) + struct net_device *ldev) { struct sk_buff *fp, *head = fq->q.fragments; int sum_truesize; @@ -313,7 +302,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, sub_frag_mem_limit(fq->q.net, sum_truesize); head->next = NULL; - head->dev = dev; + head->dev = ldev; head->tstamp = fq->q.stamp; fq->q.fragments = NULL; @@ -325,24 +314,87 @@ out_oom: return -1; } -static int lowpan_get_frag_info(struct sk_buff *skb, const u8 frag_type, - struct lowpan_frag_info *frag_info) +static int lowpan_frag_rx_handlers_result(struct sk_buff *skb, + lowpan_rx_result res) +{ + switch (res) { + case RX_QUEUED: + return NET_RX_SUCCESS; + case RX_CONTINUE: + /* nobody cared about this packet */ + net_warn_ratelimited("%s: received unknown dispatch\n", + __func__); + + /* fall-through */ + default: + /* all others failure */ + return NET_RX_DROP; + } +} + +static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb) +{ + int ret; + + if (!lowpan_is_iphc(*skb_network_header(skb))) + return RX_CONTINUE; + + ret = lowpan_iphc_decompress(skb); + if (ret < 0) + return RX_DROP; + + return RX_QUEUED; +} + +static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb) +{ + lowpan_rx_result res; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(skb); \ + if (res != RX_CONTINUE) \ + goto rxh_next; \ + } while (0) + + /* likely at first */ + CALL_RXH(lowpan_frag_rx_h_iphc); + CALL_RXH(lowpan_rx_h_ipv6); + +rxh_next: + return lowpan_frag_rx_handlers_result(skb, res); +#undef CALL_RXH +} + +#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07 +#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8 + +static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type, + struct lowpan_802154_cb *cb) { bool fail; - u8 pattern = 0, low = 0; + u8 high = 0, low = 0; __be16 d_tag = 0; - fail = lowpan_fetch_skb(skb, &pattern, 1); + fail = lowpan_fetch_skb(skb, &high, 1); fail |= lowpan_fetch_skb(skb, &low, 1); - frag_info->d_size = (pattern & 7) << 8 | low; + /* remove the dispatch value and use first three bits as high value + * for the datagram size + */ + cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) << + LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low; fail |= lowpan_fetch_skb(skb, &d_tag, 2); - frag_info->d_tag = ntohs(d_tag); + cb->d_tag = ntohs(d_tag); if (frag_type == LOWPAN_DISPATCH_FRAGN) { - fail |= lowpan_fetch_skb(skb, &frag_info->d_offset, 1); + fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1); } else { skb_reset_network_header(skb); - frag_info->d_offset = 0; + cb->d_offset = 0; + /* check if datagram_size has ipv6hdr on FRAG1 */ + fail |= cb->d_size < sizeof(struct ipv6hdr); + /* check if we can dereference the dispatch value */ + fail |= !skb->len; } if (unlikely(fail)) @@ -351,27 +403,33 @@ static int lowpan_get_frag_info(struct sk_buff *skb, const u8 frag_type, return 0; } -int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) +int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) { struct lowpan_frag_queue *fq; struct net *net = dev_net(skb->dev); - struct lowpan_frag_info *frag_info = lowpan_cb(skb); - struct ieee802154_addr source, dest; + struct lowpan_802154_cb *cb = lowpan_802154_cb(skb); + struct ieee802154_hdr hdr; int err; - source = mac_cb(skb)->source; - dest = mac_cb(skb)->dest; + if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) + goto err; - err = lowpan_get_frag_info(skb, frag_type, frag_info); + err = lowpan_get_cb(skb, frag_type, cb); if (err < 0) goto err; - if (frag_info->d_size > IPV6_MIN_MTU) { + if (frag_type == LOWPAN_DISPATCH_FRAG1) { + err = lowpan_invoke_frag_rx_handlers(skb); + if (err == NET_RX_DROP) + goto err; + } + + if (cb->d_size > IPV6_MIN_MTU) { net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; } - fq = fq_find(net, frag_info, &source, &dest); + fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { int ret; @@ -387,7 +445,6 @@ err: kfree_skb(skb); return -1; } -EXPORT_SYMBOL(lowpan_frag_rcv); #ifdef CONFIG_SYSCTL static int zero; diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index 12e10201d263..65d55e05516c 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -11,40 +11,101 @@ #include <linux/if_arp.h> #include <net/6lowpan.h> +#include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include "6lowpan_i.h" -static int lowpan_give_skb_to_device(struct sk_buff *skb, - struct net_device *dev) +#define LOWPAN_DISPATCH_FIRST 0xc0 +#define LOWPAN_DISPATCH_FRAG_MASK 0xf8 + +#define LOWPAN_DISPATCH_NALP 0x00 +#define LOWPAN_DISPATCH_ESC 0x40 +#define LOWPAN_DISPATCH_HC1 0x42 +#define LOWPAN_DISPATCH_DFF 0x43 +#define LOWPAN_DISPATCH_BC0 0x50 +#define LOWPAN_DISPATCH_MESH 0x80 + +static int lowpan_give_skb_to_device(struct sk_buff *skb) { - skb->dev = dev->ieee802154_ptr->lowpan_dev; skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; return netif_rx(skb); } -static int -iphc_decompress(struct sk_buff *skb, const struct ieee802154_hdr *hdr) +static int lowpan_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res) +{ + switch (res) { + case RX_CONTINUE: + /* nobody cared about this packet */ + net_warn_ratelimited("%s: received unknown dispatch\n", + __func__); + + /* fall-through */ + case RX_DROP_UNUSABLE: + kfree_skb(skb); + + /* fall-through */ + case RX_DROP: + return NET_RX_DROP; + case RX_QUEUED: + return lowpan_give_skb_to_device(skb); + default: + break; + } + + return NET_RX_DROP; +} + +static inline bool lowpan_is_frag1(u8 dispatch) +{ + return (dispatch & LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAG1; +} + +static inline bool lowpan_is_fragn(u8 dispatch) +{ + return (dispatch & LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAGN; +} + +static lowpan_rx_result lowpan_rx_h_frag(struct sk_buff *skb) +{ + int ret; + + if (!(lowpan_is_frag1(*skb_network_header(skb)) || + lowpan_is_fragn(*skb_network_header(skb)))) + return RX_CONTINUE; + + ret = lowpan_frag_rcv(skb, *skb_network_header(skb) & + LOWPAN_DISPATCH_FRAG_MASK); + if (ret == 1) + return RX_QUEUED; + + /* Packet is freed by lowpan_frag_rcv on error or put into the frag + * bucket. + */ + return RX_DROP; +} + +int lowpan_iphc_decompress(struct sk_buff *skb) { - u8 iphc0, iphc1; struct ieee802154_addr_sa sa, da; + struct ieee802154_hdr hdr; + u8 iphc0, iphc1; void *sap, *dap; - raw_dump_table(__func__, "raw skb data dump", skb->data, skb->len); - /* at least two bytes will be used for the encoding */ - if (skb->len < 2) + if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) return -EINVAL; - if (lowpan_fetch_skb_u8(skb, &iphc0)) - return -EINVAL; + raw_dump_table(__func__, "raw skb data dump", skb->data, skb->len); - if (lowpan_fetch_skb_u8(skb, &iphc1)) + if (lowpan_fetch_skb_u8(skb, &iphc0) || + lowpan_fetch_skb_u8(skb, &iphc1)) return -EINVAL; - ieee802154_addr_to_sa(&sa, &hdr->source); - ieee802154_addr_to_sa(&da, &hdr->dest); + ieee802154_addr_to_sa(&sa, &hdr.source); + ieee802154_addr_to_sa(&da, &hdr.dest); if (sa.addr_type == IEEE802154_ADDR_SHORT) sap = &sa.short_addr; @@ -61,77 +122,216 @@ iphc_decompress(struct sk_buff *skb, const struct ieee802154_hdr *hdr) IEEE802154_ADDR_LEN, iphc0, iphc1); } -static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static lowpan_rx_result lowpan_rx_h_iphc(struct sk_buff *skb) { - struct ieee802154_hdr hdr; int ret; - if (dev->type != ARPHRD_IEEE802154 || - !dev->ieee802154_ptr->lowpan_dev) - goto drop; + if (!lowpan_is_iphc(*skb_network_header(skb))) + return RX_CONTINUE; - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - goto drop; + /* Setting datagram_offset to zero indicates non frag handling + * while doing lowpan_header_decompress. + */ + lowpan_802154_cb(skb)->d_size = 0; - if (!netif_running(dev)) - goto drop_skb; + ret = lowpan_iphc_decompress(skb); + if (ret < 0) + return RX_DROP_UNUSABLE; - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop_skb; + return RX_QUEUED; +} - if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) - goto drop_skb; - - /* check that it's our buffer */ - if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { - /* Pull off the 1-byte of 6lowpan header. */ - skb_pull(skb, 1); - return lowpan_give_skb_to_device(skb, dev); - } else { - switch (skb->data[0] & 0xe0) { - case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ - ret = iphc_decompress(skb, &hdr); - if (ret < 0) - goto drop_skb; - - return lowpan_give_skb_to_device(skb, dev); - case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ - ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAG1); - if (ret == 1) { - ret = iphc_decompress(skb, &hdr); - if (ret < 0) - goto drop_skb; - - return lowpan_give_skb_to_device(skb, dev); - } else if (ret == -1) { - return NET_RX_DROP; - } else { - return NET_RX_SUCCESS; - } - case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */ - ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAGN); - if (ret == 1) { - ret = iphc_decompress(skb, &hdr); - if (ret < 0) - goto drop_skb; - - return lowpan_give_skb_to_device(skb, dev); - } else if (ret == -1) { - return NET_RX_DROP; - } else { - return NET_RX_SUCCESS; - } - default: - break; - } +lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb) +{ + if (!lowpan_is_ipv6(*skb_network_header(skb))) + return RX_CONTINUE; + + /* Pull off the 1-byte of 6lowpan header. */ + skb_pull(skb, 1); + return RX_QUEUED; +} + +static inline bool lowpan_is_esc(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_ESC; +} + +static lowpan_rx_result lowpan_rx_h_esc(struct sk_buff *skb) +{ + if (!lowpan_is_esc(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN ESC not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_hc1(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_HC1; +} + +static lowpan_rx_result lowpan_rx_h_hc1(struct sk_buff *skb) +{ + if (!lowpan_is_hc1(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN HC1 not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_dff(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_DFF; +} + +static lowpan_rx_result lowpan_rx_h_dff(struct sk_buff *skb) +{ + if (!lowpan_is_dff(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN DFF not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_bc0(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_BC0; +} + +static lowpan_rx_result lowpan_rx_h_bc0(struct sk_buff *skb) +{ + if (!lowpan_is_bc0(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN BC0 not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_mesh(u8 dispatch) +{ + return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_MESH; +} + +static lowpan_rx_result lowpan_rx_h_mesh(struct sk_buff *skb) +{ + if (!lowpan_is_mesh(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN MESH not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static int lowpan_invoke_rx_handlers(struct sk_buff *skb) +{ + lowpan_rx_result res; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(skb); \ + if (res != RX_CONTINUE) \ + goto rxh_next; \ + } while (0) + + /* likely at first */ + CALL_RXH(lowpan_rx_h_iphc); + CALL_RXH(lowpan_rx_h_frag); + CALL_RXH(lowpan_rx_h_ipv6); + CALL_RXH(lowpan_rx_h_esc); + CALL_RXH(lowpan_rx_h_hc1); + CALL_RXH(lowpan_rx_h_dff); + CALL_RXH(lowpan_rx_h_bc0); + CALL_RXH(lowpan_rx_h_mesh); + +rxh_next: + return lowpan_rx_handlers_result(skb, res); +#undef CALL_RXH +} + +static inline bool lowpan_is_nalp(u8 dispatch) +{ + return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_NALP; +} + +/* Lookup for reserved dispatch values at: + * https://www.iana.org/assignments/_6lowpan-parameters/_6lowpan-parameters.xhtml#_6lowpan-parameters-1 + * + * Last Updated: 2015-01-22 + */ +static inline bool lowpan_is_reserved(u8 dispatch) +{ + return ((dispatch >= 0x44 && dispatch <= 0x4F) || + (dispatch >= 0x51 && dispatch <= 0x5F) || + (dispatch >= 0xc8 && dispatch <= 0xdf) || + (dispatch >= 0xe8 && dispatch <= 0xff)); +} + +/* lowpan_rx_h_check checks on generic 6LoWPAN requirements + * in MAC and 6LoWPAN header. + * + * Don't manipulate the skb here, it could be shared buffer. + */ +static inline bool lowpan_rx_h_check(struct sk_buff *skb) +{ + __le16 fc = ieee802154_get_fc_from_skb(skb); + + /* check on ieee802154 conform 6LoWPAN header */ + if (!ieee802154_is_data(fc) || + !ieee802154_is_intra_pan(fc)) + return false; + + /* check if we can dereference the dispatch */ + if (unlikely(!skb->len)) + return false; + + if (lowpan_is_nalp(*skb_network_header(skb)) || + lowpan_is_reserved(*skb_network_header(skb))) + return false; + + return true; +} + +static int lowpan_rcv(struct sk_buff *skb, struct net_device *wdev, + struct packet_type *pt, struct net_device *orig_wdev) +{ + struct net_device *ldev; + + if (wdev->type != ARPHRD_IEEE802154 || + skb->pkt_type == PACKET_OTHERHOST || + !lowpan_rx_h_check(skb)) + return NET_RX_DROP; + + ldev = wdev->ieee802154_ptr->lowpan_dev; + if (!ldev || !netif_running(ldev)) + return NET_RX_DROP; + + /* Replacing skb->dev and followed rx handlers will manipulate skb. */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + skb->dev = ldev; + + /* When receive frag1 it's likely that we manipulate the buffer. + * When recevie iphc we manipulate the data buffer. So we need + * to unshare the buffer. + */ + if (lowpan_is_frag1(*skb_network_header(skb)) || + lowpan_is_iphc(*skb_network_header(skb))) { + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; } -drop_skb: - kfree_skb(skb); -drop: - return NET_RX_DROP; + return lowpan_invoke_rx_handlers(skb); } static struct packet_type lowpan_packet_type = { diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index f6263fc12340..62a21f6f021e 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -10,6 +10,7 @@ #include <net/6lowpan.h> #include <net/ieee802154_netdev.h> +#include <net/mac802154.h> #include "6lowpan_i.h" @@ -36,7 +37,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) sizeof(struct lowpan_addr_info)); } -int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, +/* This callback will be called from AF_PACKET and IPv6 stack, the AF_PACKET + * sockets gives an 8 byte array for addresses only! + * + * TODO I think AF_PACKET DGRAM (sending/receiving) RAW (sending) makes no + * sense here. We should disable it, the right use-case would be AF_INET6 + * RAW/DGRAM sockets. + */ +int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { @@ -51,7 +59,7 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, return 0; if (!saddr) - saddr = dev->dev_addr; + saddr = ldev->dev_addr; raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); @@ -71,28 +79,33 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, static struct sk_buff* lowpan_alloc_frag(struct sk_buff *skb, int size, - const struct ieee802154_hdr *master_hdr) + const struct ieee802154_hdr *master_hdr, bool frag1) { - struct net_device *real_dev = lowpan_dev_info(skb->dev)->real_dev; + struct net_device *wdev = lowpan_dev_info(skb->dev)->wdev; struct sk_buff *frag; int rc; - frag = alloc_skb(real_dev->hard_header_len + - real_dev->needed_tailroom + size, + frag = alloc_skb(wdev->needed_headroom + wdev->needed_tailroom + size, GFP_ATOMIC); if (likely(frag)) { - frag->dev = real_dev; + frag->dev = wdev; frag->priority = skb->priority; - skb_reserve(frag, real_dev->hard_header_len); + skb_reserve(frag, wdev->needed_headroom); skb_reset_network_header(frag); *mac_cb(frag) = *mac_cb(skb); - rc = dev_hard_header(frag, real_dev, 0, &master_hdr->dest, - &master_hdr->source, size); - if (rc < 0) { - kfree_skb(frag); - return ERR_PTR(rc); + if (frag1) { + memcpy(skb_put(frag, skb->mac_len), + skb_mac_header(skb), skb->mac_len); + } else { + rc = wpan_dev_hard_header(frag, wdev, + &master_hdr->dest, + &master_hdr->source, size); + if (rc < 0) { + kfree_skb(frag); + return ERR_PTR(rc); + } } } else { frag = ERR_PTR(-ENOMEM); @@ -104,13 +117,13 @@ lowpan_alloc_frag(struct sk_buff *skb, int size, static int lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, u8 *frag_hdr, int frag_hdrlen, - int offset, int len) + int offset, int len, bool frag1) { struct sk_buff *frag; raw_dump_inline(__func__, " fragment header", frag_hdr, frag_hdrlen); - frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr); + frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr, frag1); if (IS_ERR(frag)) return PTR_ERR(frag); @@ -123,19 +136,17 @@ lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, } static int -lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, - const struct ieee802154_hdr *wpan_hdr) +lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, + const struct ieee802154_hdr *wpan_hdr, u16 dgram_size, + u16 dgram_offset) { - u16 dgram_size, dgram_offset; __be16 frag_tag; u8 frag_hdr[5]; int frag_cap, frag_len, payload_cap, rc; int skb_unprocessed, skb_offset; - dgram_size = lowpan_uncompress_size(skb, &dgram_offset) - - skb->mac_len; - frag_tag = htons(lowpan_dev_info(dev)->fragment_tag); - lowpan_dev_info(dev)->fragment_tag++; + frag_tag = htons(lowpan_dev_info(ldev)->fragment_tag); + lowpan_dev_info(ldev)->fragment_tag++; frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07); frag_hdr[1] = dgram_size & 0xff; @@ -151,7 +162,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, LOWPAN_FRAG1_HEAD_SIZE, 0, - frag_len + skb_network_header_len(skb)); + frag_len + skb_network_header_len(skb), + true); if (rc) { pr_debug("%s unable to send FRAG1 packet (tag: %d)", __func__, ntohs(frag_tag)); @@ -172,7 +184,7 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, LOWPAN_FRAGN_HEAD_SIZE, skb_offset, - frag_len); + frag_len, false); if (rc) { pr_debug("%s unable to send a FRAGN packet. (tag: %d, offset: %d)\n", __func__, ntohs(frag_tag), skb_offset); @@ -180,6 +192,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, } } while (skb_unprocessed > frag_cap); + ldev->stats.tx_packets++; + ldev->stats.tx_bytes += dgram_size; consume_skb(skb); return NET_XMIT_SUCCESS; @@ -188,9 +202,10 @@ err: return rc; } -static int lowpan_header(struct sk_buff *skb, struct net_device *dev) +static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, + u16 *dgram_size, u16 *dgram_offset) { - struct wpan_dev *wpan_dev = lowpan_dev_info(dev)->real_dev->ieee802154_ptr; + struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr; struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; @@ -202,7 +217,10 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) daddr = &info.daddr.u.extended_addr; saddr = &info.saddr.u.extended_addr; - lowpan_header_compress(skb, dev, ETH_P_IPV6, daddr, saddr, skb->len); + *dgram_size = skb->len; + lowpan_header_compress(skb, ldev, ETH_P_IPV6, daddr, saddr, skb->len); + /* dgram_offset = (saved bytes after compression) + lowpan header len */ + *dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb); cb->type = IEEE802154_FC_TYPE_DATA; @@ -227,17 +245,20 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) cb->ackreq = wpan_dev->ackreq; } - return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, - ETH_P_IPV6, (void *)&da, (void *)&sa, 0); + return wpan_dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, &da, &sa, + 0); } -netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) +netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) { struct ieee802154_hdr wpan_hdr; int max_single, ret; + u16 dgram_size, dgram_offset; pr_debug("package xmit\n"); + WARN_ON_ONCE(skb->len > IPV6_MIN_MTU); + /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ @@ -245,7 +266,7 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) if (!skb) return NET_XMIT_DROP; - ret = lowpan_header(skb, dev); + ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset); if (ret < 0) { kfree_skb(skb); return NET_XMIT_DROP; @@ -259,13 +280,16 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) max_single = ieee802154_max_payload(&wpan_hdr); if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) { - skb->dev = lowpan_dev_info(dev)->real_dev; + skb->dev = lowpan_dev_info(ldev)->wdev; + ldev->stats.tx_packets++; + ldev->stats.tx_bytes += dgram_size; return dev_queue_xmit(skb); } else { netdev_tx_t rc; pr_debug("frame is too big, fragmentation is needed\n"); - rc = lowpan_xmit_fragmented(skb, dev, &wpan_hdr); + rc = lowpan_xmit_fragmented(skb, ldev, &wpan_hdr, dgram_size, + dgram_offset); return rc < 0 ? NET_XMIT_DROP : rc; } diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig index 1370d5b0041b..188135bcb803 100644 --- a/net/ieee802154/Kconfig +++ b/net/ieee802154/Kconfig @@ -12,6 +12,11 @@ menuconfig IEEE802154 if IEEE802154 +config IEEE802154_NL802154_EXPERIMENTAL + bool "IEEE 802.15.4 experimental netlink support" + ---help--- + Adds experimental netlink support for nl802154. + config IEEE802154_SOCKET tristate "IEEE 802.15.4 socket interface" default y diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index b0248e934230..c35fdfa6d04e 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -95,6 +95,18 @@ cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx) return result; } +struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx) +{ + struct cfg802154_registered_device *rdev; + + ASSERT_RTNL(); + + rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx); + if (!rdev) + return NULL; + return &rdev->wpan_phy; +} + struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) { diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h index f3e95580caee..231fade959f3 100644 --- a/net/ieee802154/core.h +++ b/net/ieee802154/core.h @@ -42,5 +42,6 @@ extern int cfg802154_rdev_list_generation; void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); +struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */ diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c index a051b6993177..c7439f0fbbdf 100644 --- a/net/ieee802154/header_ops.c +++ b/net/ieee802154/header_ops.c @@ -83,35 +83,35 @@ ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr) } int -ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr) +ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr) { - u8 buf[MAC802154_FRAME_HARD_HEADER_LEN]; + u8 buf[IEEE802154_MAX_HEADER_LEN]; int pos = 2; int rc; - struct ieee802154_hdr_fc fc = hdr->fc; + struct ieee802154_hdr_fc *fc = &hdr->fc; buf[pos++] = hdr->seq; - fc.dest_addr_mode = hdr->dest.mode; + fc->dest_addr_mode = hdr->dest.mode; rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false); if (rc < 0) return -EINVAL; pos += rc; - fc.source_addr_mode = hdr->source.mode; + fc->source_addr_mode = hdr->source.mode; if (hdr->source.pan_id == hdr->dest.pan_id && hdr->dest.mode != IEEE802154_ADDR_NONE) - fc.intra_pan = true; + fc->intra_pan = true; - rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc.intra_pan); + rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan); if (rc < 0) return -EINVAL; pos += rc; - if (fc.security_enabled) { - fc.version = 1; + if (fc->security_enabled) { + fc->version = 1; rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec); if (rc < 0) @@ -120,7 +120,7 @@ ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr) pos += rc; } - memcpy(buf, &fc, 2); + memcpy(buf, fc, 2); memcpy(skb_push(skb, pos), buf, pos); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 1b00a14850cb..16ef0d9f566e 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -232,8 +232,86 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, + [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, + [NL802154_ATTR_SEC_OUT_KEY_ID] = { .type = NLA_NESTED, }, + [NL802154_ATTR_SEC_FRAME_COUNTER] = { .type = NLA_U32 }, + + [NL802154_ATTR_SEC_LEVEL] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_DEVICE] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_DEVKEY] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_KEY] = { .type = NLA_NESTED }, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static int +nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct cfg802154_registered_device **rdev, + struct wpan_dev **wpan_dev) +{ + int err; + + rtnl_lock(); + + if (!cb->args[0]) { + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, + nl802154_fam.attrbuf, nl802154_fam.maxattr, + nl802154_policy); + if (err) + goto out_unlock; + + *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), + nl802154_fam.attrbuf); + if (IS_ERR(*wpan_dev)) { + err = PTR_ERR(*wpan_dev); + goto out_unlock; + } + *rdev = wpan_phy_to_rdev((*wpan_dev)->wpan_phy); + /* 0 is the first index - add 1 to parse only once */ + cb->args[0] = (*rdev)->wpan_phy_idx + 1; + cb->args[1] = (*wpan_dev)->identifier; + } else { + /* subtract the 1 again here */ + struct wpan_phy *wpan_phy = wpan_phy_idx_to_wpan_phy(cb->args[0] - 1); + struct wpan_dev *tmp; + + if (!wpan_phy) { + err = -ENODEV; + goto out_unlock; + } + *rdev = wpan_phy_to_rdev(wpan_phy); + *wpan_dev = NULL; + + list_for_each_entry(tmp, &(*rdev)->wpan_dev_list, list) { + if (tmp->identifier == cb->args[1]) { + *wpan_dev = tmp; + break; + } + } + + if (!*wpan_dev) { + err = -ENODEV; + goto out_unlock; + } + } + + return 0; + out_unlock: + rtnl_unlock(); + return err; +} + +static void +nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev) +{ + rtnl_unlock(); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + /* message building helper */ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) @@ -612,6 +690,107 @@ static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev) ((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32); } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +#include <net/ieee802154_netdev.h> + +static int +ieee802154_llsec_send_key_id(struct sk_buff *msg, + const struct ieee802154_llsec_key_id *desc) +{ + struct nlattr *nl_dev_addr; + + if (nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, desc->mode)) + return -ENOBUFS; + + switch (desc->mode) { + case NL802154_KEY_ID_MODE_IMPLICIT: + nl_dev_addr = nla_nest_start(msg, NL802154_KEY_ID_ATTR_IMPLICIT); + if (!nl_dev_addr) + return -ENOBUFS; + + if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_PAN_ID, + desc->device_addr.pan_id) || + nla_put_u32(msg, NL802154_DEV_ADDR_ATTR_MODE, + desc->device_addr.mode)) + return -ENOBUFS; + + switch (desc->device_addr.mode) { + case NL802154_DEV_ADDR_SHORT: + if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_SHORT, + desc->device_addr.short_addr)) + return -ENOBUFS; + break; + case NL802154_DEV_ADDR_EXTENDED: + if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, + desc->device_addr.extended_addr)) + return -ENOBUFS; + break; + default: + /* userspace should handle unknown */ + break; + } + + nla_nest_end(msg, nl_dev_addr); + break; + case NL802154_KEY_ID_MODE_INDEX: + break; + case NL802154_KEY_ID_MODE_INDEX_SHORT: + /* TODO renmae short_source? */ + if (nla_put_le32(msg, NL802154_KEY_ID_ATTR_SOURCE_SHORT, + desc->short_source)) + return -ENOBUFS; + break; + case NL802154_KEY_ID_MODE_INDEX_EXTENDED: + if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED, + desc->extended_source)) + return -ENOBUFS; + break; + default: + /* userspace should handle unknown */ + break; + } + + /* TODO key_id to key_idx ? Check naming */ + if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { + if (nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, desc->id)) + return -ENOBUFS; + } + + return 0; +} + +static int nl802154_get_llsec_params(struct sk_buff *msg, + struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + struct nlattr *nl_key_id; + struct ieee802154_llsec_params params; + int ret; + + ret = rdev_get_llsec_params(rdev, wpan_dev, ¶ms); + if (ret < 0) + return ret; + + if (nla_put_u8(msg, NL802154_ATTR_SEC_ENABLED, params.enabled) || + nla_put_u32(msg, NL802154_ATTR_SEC_OUT_LEVEL, params.out_level) || + nla_put_be32(msg, NL802154_ATTR_SEC_FRAME_COUNTER, + params.frame_counter)) + return -ENOBUFS; + + nl_key_id = nla_nest_start(msg, NL802154_ATTR_SEC_OUT_KEY_ID); + if (!nl_key_id) + return -ENOBUFS; + + ret = ieee802154_llsec_send_key_id(msg, ¶ms.out_key); + if (ret < 0) + return ret; + + nla_nest_end(msg, nl_key_id); + + return 0; +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + static int nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, @@ -663,6 +842,11 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq)) goto nla_put_failure; +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0) + goto nla_put_failure; +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + genlmsg_end(msg, hdr); return 0; @@ -753,10 +937,8 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - /* TODO add nla_get_le64 to netlink */ if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) - extended_addr = (__force __le64)nla_get_u64( - info->attrs[NL802154_ATTR_EXTENDED_ADDR]); + extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); if (!rdev->ops->add_virtual_intf) return -EOPNOTSUPP; @@ -1034,7 +1216,7 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; - bool mode; + int mode; if (netif_running(dev)) return -EBUSY; @@ -1042,7 +1224,11 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL802154_ATTR_LBT_MODE]) return -EINVAL; - mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + mode = nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + + if (mode != 0 && mode != 1) + return -EINVAL; + if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt)) return -EINVAL; @@ -1055,7 +1241,7 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; - bool ackreq; + int ackreq; if (netif_running(dev)) return -EBUSY; @@ -1063,10 +1249,846 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]) return -EINVAL; - ackreq = !!nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); + ackreq = nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); + + if (ackreq != 0 && ackreq != 1) + return -EINVAL; + return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { + [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, + [NL802154_DEV_ADDR_ATTR_MODE] = { .type = NLA_U32 }, + [NL802154_DEV_ADDR_ATTR_SHORT] = { .type = NLA_U16 }, + [NL802154_DEV_ADDR_ATTR_EXTENDED] = { .type = NLA_U64 }, +}; + +static int +ieee802154_llsec_parse_dev_addr(struct nlattr *nla, + struct ieee802154_addr *addr) +{ + struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, + nl802154_dev_addr_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] && + !attrs[NL802154_DEV_ADDR_ATTR_MODE] && + !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] || + attrs[NL802154_DEV_ADDR_ATTR_EXTENDED])) + return -EINVAL; + + addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]); + addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]); + switch (addr->mode) { + case NL802154_DEV_ADDR_SHORT: + addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]); + break; + case NL802154_DEV_ADDR_EXTENDED: + addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]); + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy nl802154_key_id_policy[NL802154_KEY_ID_ATTR_MAX + 1] = { + [NL802154_KEY_ID_ATTR_MODE] = { .type = NLA_U32 }, + [NL802154_KEY_ID_ATTR_INDEX] = { .type = NLA_U8 }, + [NL802154_KEY_ID_ATTR_IMPLICIT] = { .type = NLA_NESTED }, + [NL802154_KEY_ID_ATTR_SOURCE_SHORT] = { .type = NLA_U32 }, + [NL802154_KEY_ID_ATTR_SOURCE_EXTENDED] = { .type = NLA_U64 }, +}; + +static int +ieee802154_llsec_parse_key_id(struct nlattr *nla, + struct ieee802154_llsec_key_id *desc) +{ + struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_KEY_ID_ATTR_MAX, nla, + nl802154_key_id_policy)) + return -EINVAL; + + if (!attrs[NL802154_KEY_ID_ATTR_MODE]) + return -EINVAL; + + desc->mode = nla_get_u32(attrs[NL802154_KEY_ID_ATTR_MODE]); + switch (desc->mode) { + case NL802154_KEY_ID_MODE_IMPLICIT: + if (!attrs[NL802154_KEY_ID_ATTR_IMPLICIT]) + return -EINVAL; + + if (ieee802154_llsec_parse_dev_addr(attrs[NL802154_KEY_ID_ATTR_IMPLICIT], + &desc->device_addr) < 0) + return -EINVAL; + break; + case NL802154_KEY_ID_MODE_INDEX: + break; + case NL802154_KEY_ID_MODE_INDEX_SHORT: + if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]) + return -EINVAL; + + desc->short_source = nla_get_le32(attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]); + break; + case NL802154_KEY_ID_MODE_INDEX_EXTENDED: + if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]) + return -EINVAL; + + desc->extended_source = nla_get_le64(attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]); + break; + default: + return -EINVAL; + } + + if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { + if (!attrs[NL802154_KEY_ID_ATTR_INDEX]) + return -EINVAL; + + /* TODO change id to idx */ + desc->id = nla_get_u8(attrs[NL802154_KEY_ID_ATTR_INDEX]); + } + + return 0; +} + +static int nl802154_set_llsec_params(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_params params; + u32 changed = 0; + int ret; + + if (info->attrs[NL802154_ATTR_SEC_ENABLED]) { + u8 enabled; + + enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); + if (enabled != 0 && enabled != 1) + return -EINVAL; + + params.enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); + changed |= IEEE802154_LLSEC_PARAM_ENABLED; + } + + if (info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID]) { + ret = ieee802154_llsec_parse_key_id(info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID], + ¶ms.out_key); + if (ret < 0) + return ret; + + changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; + } + + if (info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]) { + params.out_level = nla_get_u32(info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]); + if (params.out_level > NL802154_SECLEVEL_MAX) + return -EINVAL; + + changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; + } + + if (info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]) { + params.frame_counter = nla_get_be32(info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]); + changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; + } + + return rdev_set_llsec_params(rdev, wpan_dev, ¶ms, changed); +} + +static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_key_entry *key) +{ + void *hdr; + u32 commands[NL802154_CMD_FRAME_NR_IDS / 32]; + struct nlattr *nl_key, *nl_key_id; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_key = nla_nest_start(msg, NL802154_ATTR_SEC_KEY); + if (!nl_key) + goto nla_put_failure; + + nl_key_id = nla_nest_start(msg, NL802154_KEY_ATTR_ID); + if (!nl_key_id) + goto nla_put_failure; + + if (ieee802154_llsec_send_key_id(msg, &key->id) < 0) + goto nla_put_failure; + + nla_nest_end(msg, nl_key_id); + + if (nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES, + key->key->frame_types)) + goto nla_put_failure; + + if (key->key->frame_types & BIT(NL802154_FRAME_CMD)) { + /* TODO for each nested */ + memset(commands, 0, sizeof(commands)); + commands[7] = key->key->cmd_frame_ids; + if (nla_put(msg, NL802154_KEY_ATTR_USAGE_CMDS, + sizeof(commands), commands)) + goto nla_put_failure; + } + + if (nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE, + key->key->key)) + goto nla_put_failure; + + nla_nest_end(msg, nl_key); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_key_entry *key; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(key, &table->keys, list) { + if (nl802154_send_key(skb, NL802154_CMD_NEW_SEC_KEY, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, key) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_key_policy[NL802154_KEY_ATTR_MAX + 1] = { + [NL802154_KEY_ATTR_ID] = { NLA_NESTED }, + /* TODO handle it as for_each_nested and NLA_FLAG? */ + [NL802154_KEY_ATTR_USAGE_FRAMES] = { NLA_U8 }, + /* TODO handle it as for_each_nested, not static array? */ + [NL802154_KEY_ATTR_USAGE_CMDS] = { .len = NL802154_CMD_FRAME_NR_IDS / 8 }, + [NL802154_KEY_ATTR_BYTES] = { .len = NL802154_KEY_SIZE }, +}; + +static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; + struct ieee802154_llsec_key key = { }; + struct ieee802154_llsec_key_id id = { }; + u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { }; + + if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_KEY], + nl802154_key_policy)) + return -EINVAL; + + if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || + !attrs[NL802154_KEY_ATTR_BYTES]) + return -EINVAL; + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + key.frame_types = nla_get_u8(attrs[NL802154_KEY_ATTR_USAGE_FRAMES]); + if (key.frame_types > BIT(NL802154_FRAME_MAX) || + ((key.frame_types & BIT(NL802154_FRAME_CMD)) && + !attrs[NL802154_KEY_ATTR_USAGE_CMDS])) + return -EINVAL; + + if (attrs[NL802154_KEY_ATTR_USAGE_CMDS]) { + /* TODO for each nested */ + nla_memcpy(commands, attrs[NL802154_KEY_ATTR_USAGE_CMDS], + NL802154_CMD_FRAME_NR_IDS / 8); + + /* TODO understand the -EINVAL logic here? last condition */ + if (commands[0] || commands[1] || commands[2] || commands[3] || + commands[4] || commands[5] || commands[6] || + commands[7] > BIT(NL802154_CMD_FRAME_MAX)) + return -EINVAL; + + key.cmd_frame_ids = commands[7]; + } else { + key.cmd_frame_ids = 0; + } + + nla_memcpy(key.key, attrs[NL802154_KEY_ATTR_BYTES], NL802154_KEY_SIZE); + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + return rdev_add_llsec_key(rdev, wpan_dev, &id, &key); +} + +static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; + struct ieee802154_llsec_key_id id; + + if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_KEY], + nl802154_key_policy)) + return -EINVAL; + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + return rdev_del_llsec_key(rdev, wpan_dev, &id); +} + +static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_device *dev_desc) +{ + void *hdr; + struct nlattr *nl_device; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_device = nla_nest_start(msg, NL802154_ATTR_SEC_DEVICE); + if (!nl_device) + goto nla_put_failure; + + if (nla_put_u32(msg, NL802154_DEV_ATTR_FRAME_COUNTER, + dev_desc->frame_counter) || + nla_put_le16(msg, NL802154_DEV_ATTR_PAN_ID, dev_desc->pan_id) || + nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR, + dev_desc->short_addr) || + nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR, + dev_desc->hwaddr) || + nla_put_u8(msg, NL802154_DEV_ATTR_SECLEVEL_EXEMPT, + dev_desc->seclevel_exempt) || + nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode)) + goto nla_put_failure; + + nla_nest_end(msg, nl_device); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_device *dev; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(dev, &table->devices, list) { + if (nl802154_send_device(skb, NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, dev) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_dev_policy[NL802154_DEV_ATTR_MAX + 1] = { + [NL802154_DEV_ATTR_FRAME_COUNTER] = { NLA_U32 }, + [NL802154_DEV_ATTR_PAN_ID] = { .type = NLA_U16 }, + [NL802154_DEV_ATTR_SHORT_ADDR] = { .type = NLA_U16 }, + [NL802154_DEV_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 }, + [NL802154_DEV_ATTR_SECLEVEL_EXEMPT] = { NLA_U8 }, + [NL802154_DEV_ATTR_KEY_MODE] = { NLA_U32 }, +}; + +static int +ieee802154_llsec_parse_device(struct nlattr *nla, + struct ieee802154_llsec_device *dev) +{ + struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, nla, + nl802154_dev_policy)) + return -EINVAL; + + memset(dev, 0, sizeof(*dev)); + + if (!attrs[NL802154_DEV_ATTR_FRAME_COUNTER] || + !attrs[NL802154_DEV_ATTR_PAN_ID] || + !attrs[NL802154_DEV_ATTR_SHORT_ADDR] || + !attrs[NL802154_DEV_ATTR_EXTENDED_ADDR] || + !attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT] || + !attrs[NL802154_DEV_ATTR_KEY_MODE]) + return -EINVAL; + + /* TODO be32 */ + dev->frame_counter = nla_get_u32(attrs[NL802154_DEV_ATTR_FRAME_COUNTER]); + dev->pan_id = nla_get_le16(attrs[NL802154_DEV_ATTR_PAN_ID]); + dev->short_addr = nla_get_le16(attrs[NL802154_DEV_ATTR_SHORT_ADDR]); + /* TODO rename hwaddr to extended_addr */ + dev->hwaddr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); + dev->seclevel_exempt = nla_get_u8(attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT]); + dev->key_mode = nla_get_u32(attrs[NL802154_DEV_ATTR_KEY_MODE]); + + if (dev->key_mode > NL802154_DEVKEY_MAX || + (dev->seclevel_exempt != 0 && dev->seclevel_exempt != 1)) + return -EINVAL; + + return 0; +} + +static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_device dev_desc; + + if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE], + &dev_desc) < 0) + return -EINVAL; + + return rdev_add_device(rdev, wpan_dev, &dev_desc); +} + +static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; + __le64 extended_addr; + + if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVICE], + nl802154_dev_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + extended_addr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); + return rdev_del_device(rdev, wpan_dev, extended_addr); +} + +static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + void *hdr; + struct nlattr *nl_devkey, *nl_key_id; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_devkey = nla_nest_start(msg, NL802154_ATTR_SEC_DEVKEY); + if (!nl_devkey) + goto nla_put_failure; + + if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR, + extended_addr) || + nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER, + devkey->frame_counter)) + goto nla_put_failure; + + nl_key_id = nla_nest_start(msg, NL802154_DEVKEY_ATTR_ID); + if (!nl_key_id) + goto nla_put_failure; + + if (ieee802154_llsec_send_key_id(msg, &devkey->key_id) < 0) + goto nla_put_failure; + + nla_nest_end(msg, nl_key_id); + nla_nest_end(msg, nl_devkey); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_device_key *kpos; + struct ieee802154_llsec_device *dpos; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + /* TODO look if remove devkey and do some nested attribute */ + list_for_each_entry(dpos, &table->devices, list) { + list_for_each_entry(kpos, &dpos->keys, list) { + if (nl802154_send_devkey(skb, + NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, rdev, + wpan_dev->netdev, + dpos->hwaddr, + kpos) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_devkey_policy[NL802154_DEVKEY_ATTR_MAX + 1] = { + [NL802154_DEVKEY_ATTR_FRAME_COUNTER] = { NLA_U32 }, + [NL802154_DEVKEY_ATTR_EXTENDED_ADDR] = { NLA_U64 }, + [NL802154_DEVKEY_ATTR_ID] = { NLA_NESTED }, +}; + +static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; + struct ieee802154_llsec_device_key key; + __le64 extended_addr; + + if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || + nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVKEY], + nl802154_devkey_policy) < 0) + return -EINVAL; + + if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] || + !attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + /* TODO change key.id ? */ + if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], + &key.key_id) < 0) + return -ENOBUFS; + + /* TODO be32 */ + key.frame_counter = nla_get_u32(attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER]); + /* TODO change naming hwaddr -> extended_addr + * check unique identifier short+pan OR extended_addr + */ + extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); + return rdev_add_devkey(rdev, wpan_dev, extended_addr, &key); +} + +static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; + struct ieee802154_llsec_device_key key; + __le64 extended_addr; + + if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVKEY], + nl802154_devkey_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + /* TODO change key.id ? */ + if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], + &key.key_id) < 0) + return -ENOBUFS; + + /* TODO change naming hwaddr -> extended_addr + * check unique identifier short+pan OR extended_addr + */ + extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); + return rdev_del_devkey(rdev, wpan_dev, extended_addr, &key); +} + +static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_seclevel *sl) +{ + void *hdr; + struct nlattr *nl_seclevel; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_seclevel = nla_nest_start(msg, NL802154_ATTR_SEC_LEVEL); + if (!nl_seclevel) + goto nla_put_failure; + + if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_FRAME, sl->frame_type) || + nla_put_u32(msg, NL802154_SECLEVEL_ATTR_LEVELS, sl->sec_levels) || + nla_put_u8(msg, NL802154_SECLEVEL_ATTR_DEV_OVERRIDE, + sl->device_override)) + goto nla_put_failure; + + if (sl->frame_type == NL802154_FRAME_CMD) { + if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_CMD_FRAME, + sl->cmd_frame_id)) + goto nla_put_failure; + } + + nla_nest_end(msg, nl_seclevel); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_seclevel *sl; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(sl, &table->security_levels, list) { + if (nl802154_send_seclevel(skb, NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, sl) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_seclevel_policy[NL802154_SECLEVEL_ATTR_MAX + 1] = { + [NL802154_SECLEVEL_ATTR_LEVELS] = { .type = NLA_U8 }, + [NL802154_SECLEVEL_ATTR_FRAME] = { .type = NLA_U32 }, + [NL802154_SECLEVEL_ATTR_CMD_FRAME] = { .type = NLA_U32 }, + [NL802154_SECLEVEL_ATTR_DEV_OVERRIDE] = { .type = NLA_U8 }, +}; + +static int +llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl) +{ + struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, + nl802154_seclevel_policy)) + return -EINVAL; + + memset(sl, 0, sizeof(*sl)); + + if (!attrs[NL802154_SECLEVEL_ATTR_LEVELS] || + !attrs[NL802154_SECLEVEL_ATTR_FRAME] || + !attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]) + return -EINVAL; + + sl->sec_levels = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_LEVELS]); + sl->frame_type = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_FRAME]); + sl->device_override = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]); + if (sl->frame_type > NL802154_FRAME_MAX || + (sl->device_override != 0 && sl->device_override != 1)) + return -EINVAL; + + if (sl->frame_type == NL802154_FRAME_CMD) { + if (!attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]) + return -EINVAL; + + sl->cmd_frame_id = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]); + if (sl->cmd_frame_id > NL802154_CMD_FRAME_MAX) + return -EINVAL; + } + + return 0; +} + +static int nl802154_add_llsec_seclevel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_seclevel sl; + + if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], + &sl) < 0) + return -EINVAL; + + return rdev_add_seclevel(rdev, wpan_dev, &sl); +} + +static int nl802154_del_llsec_seclevel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_seclevel sl; + + if (!info->attrs[NL802154_ATTR_SEC_LEVEL] || + llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], + &sl) < 0) + return -EINVAL; + + return rdev_del_seclevel(rdev, wpan_dev, &sl); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + #define NL802154_FLAG_NEED_WPAN_PHY 0x01 #define NL802154_FLAG_NEED_NETDEV 0x02 #define NL802154_FLAG_NEED_RTNL 0x04 @@ -1281,6 +2303,119 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + { + .cmd = NL802154_CMD_SET_SEC_PARAMS, + .doit = nl802154_set_llsec_params, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_GET_SEC_KEY, + /* TODO .doit by matching key id? */ + .dumpit = nl802154_dump_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_KEY, + .doit = nl802154_add_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_KEY, + .doit = nl802154_del_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + /* TODO unique identifier must short+pan OR extended_addr */ + { + .cmd = NL802154_CMD_GET_SEC_DEV, + /* TODO .doit by matching extended_addr? */ + .dumpit = nl802154_dump_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_DEV, + .doit = nl802154_add_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_DEV, + .doit = nl802154_del_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + /* TODO remove complete devkey, put it as nested? */ + { + .cmd = NL802154_CMD_GET_SEC_DEVKEY, + /* TODO doit by matching ??? */ + .dumpit = nl802154_dump_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_DEVKEY, + .doit = nl802154_add_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_DEVKEY, + .doit = nl802154_del_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_GET_SEC_LEVEL, + /* TODO .doit by matching frame_type? */ + .dumpit = nl802154_dump_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_LEVEL, + .doit = nl802154_add_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_LEVEL, + /* TODO match frame_type only? */ + .doit = nl802154_del_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; /* initialisation/exit functions */ diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 03b357501cc5..4441c63b3ea6 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -208,4 +208,113 @@ rdev_set_ackreq_default(struct cfg802154_registered_device *rdev, return ret; } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +/* TODO this is already a nl802154, so move into ieee802154 */ +static inline void +rdev_get_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_table **table) +{ + rdev->ops->get_llsec_table(&rdev->wpan_phy, wpan_dev, table); +} + +static inline void +rdev_lock_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + rdev->ops->lock_llsec_table(&rdev->wpan_phy, wpan_dev); +} + +static inline void +rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + rdev->ops->unlock_llsec_table(&rdev->wpan_phy, wpan_dev); +} + +static inline int +rdev_get_llsec_params(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_params *params) +{ + return rdev->ops->get_llsec_params(&rdev->wpan_phy, wpan_dev, params); +} + +static inline int +rdev_set_llsec_params(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_params *params, + u32 changed) +{ + return rdev->ops->set_llsec_params(&rdev->wpan_phy, wpan_dev, params, + changed); +} + +static inline int +rdev_add_llsec_key(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + return rdev->ops->add_llsec_key(&rdev->wpan_phy, wpan_dev, id, key); +} + +static inline int +rdev_del_llsec_key(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id) +{ + return rdev->ops->del_llsec_key(&rdev->wpan_phy, wpan_dev, id); +} + +static inline int +rdev_add_seclevel(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + return rdev->ops->add_seclevel(&rdev->wpan_phy, wpan_dev, sl); +} + +static inline int +rdev_del_seclevel(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + return rdev->ops->del_seclevel(&rdev->wpan_phy, wpan_dev, sl); +} + +static inline int +rdev_add_device(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_device *dev_desc) +{ + return rdev->ops->add_device(&rdev->wpan_phy, wpan_dev, dev_desc); +} + +static inline int +rdev_del_device(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr) +{ + return rdev->ops->del_device(&rdev->wpan_phy, wpan_dev, extended_addr); +} + +static inline int +rdev_add_devkey(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + return rdev->ops->add_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, + devkey); +} + +static inline int +rdev_del_devkey(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + return rdev->ops->del_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, + devkey); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + #endif /* __CFG802154_RDEV_OPS */ diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index b6eacf30ee7a..a548be247e15 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -273,7 +273,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out; } - mtu = dev->mtu; + mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { @@ -637,7 +637,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = -ENXIO; goto out; } - mtu = dev->mtu; + mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { @@ -676,8 +676,8 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) cb->seclevel = ro->seclevel; cb->seclevel_override = ro->seclevel_override; - err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &dst_addr, - ro->bound ? &ro->src_addr : NULL, size); + err = wpan_dev_hard_header(skb, dev, &dst_addr, + ro->bound ? &ro->src_addr : NULL, size); if (err < 0) goto out_skb; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6fb3c90ad726..416dfa004cfb 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -331,20 +331,6 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE_CORE - tristate "Generic Network Virtualization Encapsulation library" - depends on INET - select NET_UDP_TUNNEL - ---help--- - This allows one to create Geneve virtual interfaces that provide - Layer 2 Networks over Layer 3 Networks. Geneve is often used - to tunnel virtual network infrastructure in virtualized environments. - For more information see: - http://tools.ietf.org/html/draft-gross-geneve-01 - - To compile this driver as a module, choose M here: the module - - config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index efc43f300b8c..89aacb630a53 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE_CORE) += geneve_core.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 675e88cac2b4..11c4ca13ec3b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,7 +119,7 @@ #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif -#include <net/vrf.h> +#include <net/l3mdev.h> /* The inetsw table contains everything that inet_create needs to @@ -219,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog) * shutdown() (rather than close()). */ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && - !inet_csk(sk)->icsk_accept_queue.fastopenq) { + !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) - err = fastopen_init_queue(sk, backlog); + fastopen_queue_tune(sk, backlog); else if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2) != 0) - err = fastopen_init_queue(sk, + fastopen_queue_tune(sk, ((uint)sysctl_tcp_fastopen) >> 16); - else - err = 0; - if (err) - goto out; tcp_fastopen_init_key_once(true); } @@ -428,7 +424,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; - int tb_id = RT_TABLE_LOCAL; + u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -450,7 +446,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id; + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too @@ -1043,22 +1039,16 @@ void inet_register_protosw(struct inet_protosw *p) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ - answer = NULL; last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); - /* Check only the non-wild match. */ - if (INET_PROTOSW_PERMANENT & answer->flags) { - if (protocol == answer->protocol) - break; - last_perm = lh; - } - - answer = NULL; + if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) + break; + if (protocol == answer->protocol) + goto out_permanent; + last_perm = lh; } - if (answer) - goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with @@ -1452,38 +1442,51 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); +u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) +{ + return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field); + unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) - res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); + res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 +u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, + size_t syncp_offset) +{ + void *bhptr; + struct u64_stats_sync *syncp; + u64 v; + unsigned int start; + + bhptr = per_cpu_ptr(mib, cpu); + syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); + do { + start = u64_stats_fetch_begin_irq(syncp); + v = *(((u64 *)bhptr) + offt); + } while (u64_stats_fetch_retry_irq(syncp, start)); + + return v; +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); + u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { - void *bhptr; - struct u64_stats_sync *syncp; - u64 v; - unsigned int start; - - bhptr = per_cpu_ptr(mib, cpu); - syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); - do { - start = u64_stats_fetch_begin_irq(syncp); - v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_irq(syncp, start)); - - res += v; + res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 30409b75e925..01308e6e6127 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -113,6 +113,8 @@ #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> +#include <net/dst_metadata.h> +#include <net/ip_tunnels.h> #include <linux/uaccess.h> @@ -296,7 +298,8 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw, struct sk_buff *oskb) + const unsigned char *target_hw, + struct dst_entry *dst) { struct sk_buff *skb; @@ -309,9 +312,7 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, if (!skb) return; - if (oskb) - skb_dst_copy(skb, oskb); - + skb_dst_set(skb, dst); arp_xmit(skb); } @@ -333,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; + struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -381,9 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } + if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + dst = dst_clone(skb_dst(skb)); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL, - dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); + dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -621,14 +624,20 @@ out: } EXPORT_SYMBOL(arp_create); +static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} + /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ - NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb, - NULL, skb->dev, dev_queue_xmit_sk); + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, + dev_net(skb->dev), NULL, skb, NULL, skb->dev, + arp_xmit_finish); } EXPORT_SYMBOL(arp_xmit); @@ -636,7 +645,7 @@ EXPORT_SYMBOL(arp_xmit); * Process an arp request. */ -static int arp_process(struct sock *sk, struct sk_buff *skb) +static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -648,7 +657,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) u16 dev_type = dev->type; int addr_type; struct neighbour *n; - struct net *net = dev_net(dev); + struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device @@ -749,13 +758,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) * cache. */ + if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) + reply_dst = (struct dst_entry *) + iptunnel_metadata_reply(skb_metadata_dst(skb), + GFP_ATOMIC); + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, + sha, dev->dev_addr, sha, reply_dst); goto out; } @@ -774,9 +788,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); neigh_release(n); } } @@ -794,9 +809,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); @@ -859,7 +875,7 @@ out: static void parp_redo(struct sk_buff *skb) { - arp_process(NULL, skb); + arp_process(dev_net(skb->dev), NULL, skb); } @@ -892,8 +908,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb, - dev, NULL, arp_process); + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, + dev_net(dev), NULL, skb, dev, NULL, + arp_process); consumeskb: consume_skb(skb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 2d9cb1748f81..735008472844 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1654,7 +1654,8 @@ static size_t inet_get_link_af_size(const struct net_device *dev) return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } -static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, + u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7fa277176c33..d7c2bb0c4f65 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -45,7 +45,8 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/xfrm.h> -#include <net/vrf.h> +#include <net/l3mdev.h> +#include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -212,7 +213,7 @@ void fib_flush_external(struct net *net) */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, - __be32 addr, int tb_id) + __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; @@ -239,7 +240,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, return ret; } -unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id) +unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } @@ -254,7 +255,7 @@ EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } @@ -267,7 +268,7 @@ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { - int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } @@ -331,7 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = vrf_master_ifindex_rcu(dev); + fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev); if (!fl4.flowi4_iif) fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; @@ -339,11 +340,14 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_flags = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; + trace_fib_validate_source(dev, &fl4); + net = dev_net(dev); if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; @@ -363,7 +367,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (nh->nh_dev == dev) { dev_match = true; break; - } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { dev_match = true; break; } @@ -800,7 +804,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); - int tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev); + u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 18123d50f576..f2bda9e89c61 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, - .default_pref = fib_default_rule_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1b2d01170a4d..af77298c8b4f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -57,8 +57,7 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH - -static DEFINE_SPINLOCK(fib_multipath_lock); +u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -532,7 +531,67 @@ errout: return ret; } -#endif +static void fib_rebalance(struct fib_info *fi) +{ + int total; + int w; + struct in_device *in_dev; + + if (fi->fib_nhs < 2) + return; + + total = 0; + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + + in_dev = __in_dev_get_rcu(nh->nh_dev); + + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN) + continue; + + total += nh->nh_weight; + } endfor_nexthops(fi); + + w = 0; + change_nexthops(fi) { + int upper_bound; + + in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); + + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + upper_bound = -1; + } else if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + upper_bound = -1; + } else { + w += nexthop_nh->nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, + total) - 1; + } + + atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + } endfor_nexthops(fi); + + net_get_random_once(&fib_multipath_secret, + sizeof(fib_multipath_secret)); +} + +static inline void fib_add_weight(struct fib_info *fi, + const struct fib_nh *nh) +{ + fi->fib_weight += nh->nh_weight; +} + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define fib_rebalance(fi) do { } while (0) +#define fib_add_weight(fi, nh) do { } while (0) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, @@ -863,7 +922,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { - int tb_id = cfg->fc_table; + u32 tb_id = cfg->fc_table; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; @@ -876,6 +935,50 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) return true; } +static int +fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!cfg->fc_mx) + return 0; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + fi->fib_metrics[type - 1] = val; + } + + if (ecn_ca) + fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -948,36 +1051,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } endfor_nexthops(fi) - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - u32 val; - - if (type > RTAX_MAX) - goto err_inval; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err_inval; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - fi->fib_metrics[type - 1] = val; - } - } - } + err = fib_convert_metrics(fi, cfg); + if (err) + goto failure; if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1077,8 +1153,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) + fib_rebalance(fi); + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -1300,12 +1379,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; break; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nexthop_nh->nh_power; - nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); -#endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1328,6 +1401,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) } ret++; } + + fib_rebalance(fi); } return ret; @@ -1450,20 +1525,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) !__in_dev_get_rtnl(dev)) continue; alive++; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~nh_flags; - spin_unlock_bh(&fib_multipath_lock); -#else nexthop_nh->nh_flags &= ~nh_flags; -#endif } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } + + fib_rebalance(fi); } return ret; @@ -1471,62 +1541,40 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) #ifdef CONFIG_IP_ROUTE_MULTIPATH -/* - * The algorithm is suboptimal, but it provides really - * fair weighted route distribution. - */ -void fib_select_multipath(struct fib_result *res) +void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; - struct in_device *in_dev; - int w; - spin_lock_bh(&fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); - if (nexthop_nh->nh_flags & RTNH_F_DEAD) - continue; - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nexthop_nh->nh_flags & RTNH_F_LINKDOWN) - continue; - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } endfor_nexthops(fi); - fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. */ - res->nh_sel = 0; - return; - } - } - - - /* w should be random number [0..fi->fib_power-1], - * it is pretty bad approximation. - */ - - w = jiffies % fi->fib_power; + for_nexthops(fi) { + if (hash > atomic_read(&nh->nh_upper_bound)) + continue; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && - nexthop_nh->nh_power) { - w -= nexthop_nh->nh_power; - if (w <= 0) { - nexthop_nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); - return; - } - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); /* Race condition: route has just become dead. */ res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); } #endif + +void fib_select_path(struct net *net, struct fib_result *res, + struct flowi4 *fl4, int mp_hash) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (mp_hash < 0) + mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr); + fib_select_multipath(res, mp_hash); + } + else +#endif + if (!res->prefixlen && + res->table->tb_num_default > 1 && + res->type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(fl4, res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, *res); +} +EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5154f81c5326..6c2af797f2f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,6 +81,7 @@ #include <net/sock.h> #include <net/ip_fib.h> #include <net/switchdev.h> +#include <trace/events/fib.h> #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -1278,6 +1279,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, unsigned long index; t_key cindex; + trace_fib_table_lookup(tb->tb_id, flp); + pn = t->kv; cindex = 0; @@ -1423,7 +1426,7 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; @@ -1442,6 +1445,8 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif + trace_fib_table_lookup_nh(nh); + return err; } } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 2d1646cff057..e0fcbbbcfe54 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -566,7 +566,7 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); - if (family != AF_INET && family != AF_INET6) + if (family != AF_INET) return -EINVAL; cfg->udp_config.family = family; diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c deleted file mode 100644 index 311a4ba6950a..000000000000 --- a/net/ipv4/geneve_core.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Geneve: Generic Network Virtualization Encapsulation - * - * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/skbuff.h> -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/igmp.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <linux/ethtool.h> -#include <linux/mutex.h> -#include <net/arp.h> -#include <net/ndisc.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/rtnetlink.h> -#include <net/route.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/geneve.h> -#include <net/protocol.h> -#include <net/udp_tunnel.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/addrconf.h> -#include <net/ip6_tunnel.h> -#include <net/ip6_checksum.h> -#endif - -/* Protects sock_list and refcounts. */ -static DEFINE_MUTEX(geneve_mutex); - -/* per-network namespace private data for this module */ -struct geneve_net { - struct list_head sock_list; -}; - -static int geneve_net_id; - -static struct geneve_sock *geneve_find_sock(struct net *net, - sa_family_t family, __be16 port) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - - list_for_each_entry(gs, &gn->sock_list, list) { - if (inet_sk(gs->sock->sk)->inet_sport == port && - inet_sk(gs->sock->sk)->sk.sk_family == family) - return gs; - } - - return NULL; -} - -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) -{ - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; - - memcpy(geneveh->options, options, options_len); -} - -/* Transmit a fully formatted Geneve frame. - * - * When calling this function. The skb->data should point - * to the geneve header which is fully formed. - * - * This function will add other UDP tunnel headers. - */ -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) - return -ENOMEM; - - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, - tos, ttl, df, src_port, dst_port, xnet, - !csum); -} -EXPORT_SYMBOL_GPL(geneve_xmit_skb); - -static int geneve_hlen(struct genevehdr *gh) -{ - return sizeof(*gh) + gh->opt_len * 4; -} - -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -{ - struct sk_buff *p, **pp = NULL; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_gnv = skb_gro_offset(skb); - hlen = off_gnv + sizeof(*gh); - gh = skb_gro_header_fast(skb, off_gnv); - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - if (gh->ver != GENEVE_VER || gh->oam) - goto out; - gh_len = geneve_hlen(gh); - - hlen = off_gnv + gh_len; - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - flush = 0; - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, gh_len); - skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -static int geneve_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -{ - struct genevehdr *gh; - struct packet_offload *ptype; - __be16 type; - int gh_len; - int err = -ENOSYS; - - udp_tunnel_gro_complete(skb, nhoff); - - gh = (struct genevehdr *)(skb->data + nhoff); - gh_len = geneve_hlen(gh); - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); - - rcu_read_unlock(); - return err; -} - -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(&gs->udp_offloads); - if (err) - pr_warn("geneve: udp_add_offload failed with status %d\n", - err); - } -} - -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct genevehdr *geneveh; - struct geneve_sock *gs; - int opts_len; - - /* Need Geneve and inner Ethernet header to be present */ - if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) - goto error; - - /* Return packets with reserved bits set */ - geneveh = geneve_hdr(skb); - - if (unlikely(geneveh->ver != GENEVE_VER)) - goto error; - - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) - goto error; - - opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB))) - goto drop; - - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - - gs->rcv(gs, skb); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; - -error: - /* Let the UDP layer deal with the skb */ - return 1; -} - -static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -/* Create new listen socket if needed */ -static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool ipv6) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct socket *sock; - struct udp_tunnel_sock_cfg tunnel_cfg; - - gs = kzalloc(sizeof(*gs), GFP_KERNEL); - if (!gs) - return ERR_PTR(-ENOMEM); - - sock = geneve_create_sock(net, ipv6, port); - if (IS_ERR(sock)) { - kfree(gs); - return ERR_CAST(sock); - } - - gs->sock = sock; - gs->refcnt = 1; - gs->rcv = rcv; - gs->rcv_data = data; - - /* Initialize the geneve udp offloads structure */ - gs->udp_offloads.port = port; - gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; - gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; - geneve_notify_add_rx_port(gs); - - /* Mark socket as an encapsulation socket */ - tunnel_cfg.sk_user_data = gs; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = geneve_udp_encap_recv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - - list_add(&gs->list, &gn->sock_list); - - return gs; -} - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) -{ - struct geneve_sock *gs; - - mutex_lock(&geneve_mutex); - - gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); - if (gs) { - if (!no_share && gs->rcv == rcv) - gs->refcnt++; - else - gs = ERR_PTR(-EBUSY); - } else { - gs = geneve_socket_create(net, port, rcv, data, ipv6); - } - - mutex_unlock(&geneve_mutex); - - return gs; -} -EXPORT_SYMBOL_GPL(geneve_sock_add); - -void geneve_sock_release(struct geneve_sock *gs) -{ - mutex_lock(&geneve_mutex); - - if (--gs->refcnt) - goto unlock; - - list_del(&gs->list); - geneve_notify_del_rx_port(gs); - udp_tunnel_sock_release(gs->sock); - kfree_rcu(gs, rcu); - -unlock: - mutex_unlock(&geneve_mutex); -} -EXPORT_SYMBOL_GPL(geneve_sock_release); - -static __net_init int geneve_init_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - - INIT_LIST_HEAD(&gn->sock_list); - - return 0; -} - -static struct pernet_operations geneve_net_ops = { - .init = geneve_init_net, - .id = &geneve_net_id, - .size = sizeof(struct geneve_net), -}; - -static int __init geneve_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&geneve_net_ops); - if (rc) - return rc; - - pr_info("Geneve core logic\n"); - - return 0; -} -module_init(geneve_init_module); - -static void __exit geneve_cleanup_module(void) -{ - unregister_pernet_subsys(&geneve_net_ops); -} -module_exit(geneve_cleanup_module); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>"); -MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic"); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f16488efa1c8..f3c356b7c1f0 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -96,7 +96,7 @@ #include <net/xfrm.h> #include <net/inet_common.h> #include <net/ip_fib.h> -#include <net/vrf.h> +#include <net/l3mdev.h> /* * Build xmit assembly blocks @@ -309,9 +309,10 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, rc = false; if (icmp_global_allow()) { + int vif = l3mdev_master_ifindex(dst->dev); struct inet_peer *peer; - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) @@ -426,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex; + fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -439,6 +440,22 @@ out_unlock: icmp_xmit_unlock(sk); } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* Source and destination is swapped. See ip_multipath_icmp_hash */ +static int icmp_multipath_hash_skb(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + return fib_multipath_hash(iph->daddr, iph->saddr); +} + +#else + +#define icmp_multipath_hash_skb(skb) (-1) + +#endif + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -460,10 +477,11 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex; + fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key_hash(net, fl4, + icmp_multipath_hash_skb(skb_in)); if (IS_ERR(rt)) return rt; @@ -641,7 +659,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) */ saddr = iph->daddr; - if (!(rt->rt_flags & RTCF_LOCAL)) { + if (!((type == ICMP_REDIRECT) && + net->ipv4.sysctl_icmp_redirects_use_orig_daddr) && + !(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; rcu_read_lock(); @@ -1204,6 +1224,11 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + /* Control paramerer - use the daddr of originating packets as saddr + * in redirect messages? + */ + net->ipv4.sysctl_icmp_redirects_use_orig_daddr = 0; + return 0; fail: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9fdfd9deac11..64aaf3522a59 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -110,6 +110,9 @@ #define IP_MAX_MEMBERSHIPS 20 #define IP_MAX_MSF 10 +/* IGMP reports for link-local multicast groups are enabled by default */ +int sysctl_igmp_llm_reports __read_mostly = 1; + #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -394,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(skb); + return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; + if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; @@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(pmc->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; @@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - else if (type == IGMP_HOST_LEAVE_MESSAGE) + + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return 0; + + if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; @@ -727,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(unsigned long data) @@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) if (group == IGMP_ALL_HOSTS) return false; + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { @@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; @@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; reporter = im->reporter; igmp_stop_timer(im); @@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; if (in_dev->dead) return; @@ -1518,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; /* a failover is happening and switches * must be notified immediately @@ -2545,7 +2569,7 @@ void ip_mc_drop_socket(struct sock *sk) } /* called with rcu_read_lock() */ -int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 134957159c27..514b9e910bd4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - req = reqsk_queue_remove(queue); + req = reqsk_queue_remove(queue, sk); newsk = req->sk; - sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && - tcp_rsk(req)->tfo_listener && - queue->fastopenq) { - spin_lock_bh(&queue->fastopenq->lock); + tcp_rsk(req)->tfo_listener) { + spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to @@ -348,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) req->sk = NULL; req = NULL; } - spin_unlock_bh(&queue->fastopenq->lock); + spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); @@ -408,7 +406,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -struct dst_entry *inet_csk_route_req(struct sock *sk, +struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { @@ -439,7 +437,7 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_req); -struct dst_entry *inet_csk_route_child_sock(struct sock *sk, +struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { @@ -478,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -563,7 +508,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, req->num_timeout >= rskq_defer_accept - 1; } -int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) +int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); @@ -573,26 +518,20 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - struct request_sock **prev; - bool found = false; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; + spinlock_t *lock; + bool found; - spin_lock(&queue->syn_wait_lock); + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } - } + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); - spin_unlock(&queue->syn_wait_lock); if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -613,15 +552,12 @@ static void reqsk_timer_handler(unsigned long data) struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct listen_sock *lopt = queue->listen_opt; int qlen, expire = 0, resend = 0; int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_listener->sk_state != TCP_LISTEN) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -642,9 +578,9 @@ static void reqsk_timer_handler(unsigned long data) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - qlen = listen_sock_qlen(lopt); - if (qlen >> (lopt->max_qlen_log - 1)) { - int young = listen_sock_young(lopt) << 1; + qlen = reqsk_queue_len(queue); + if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { if (qlen < young) @@ -666,41 +602,41 @@ static void reqsk_timer_handler(unsigned long data) unsigned long timeo; if (req->num_timeout++ == 0) - atomic_inc(&lopt->young_dec); + atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } +drop: inet_csk_reqsk_queue_drop(sk_listener, req); reqsk_put(req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); + + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); - setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); - req->rsk_hash = hash; - - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); + atomic_set(&req->rsk_refcnt, 2 + 1); +} - mod_timer_pinned(&req->rsk_timer, jiffies + timeout); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone @@ -793,12 +729,10 @@ EXPORT_SYMBOL(inet_csk_prepare_forced_close); int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { - struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + struct inet_sock *inet = inet_sk(sk); - if (rc != 0) - return rc; + reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; @@ -820,7 +754,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); @@ -833,11 +766,7 @@ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct request_sock *acc_req; - struct request_sock *req; - - /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(queue); + struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -847,13 +776,9 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(queue); - - while ((req = acc_req) != NULL) { + while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk; - acc_req = req->dl_next; - local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); @@ -883,18 +808,19 @@ void inet_csk_listen_stop(struct sock *sk) local_bh_enable(); sock_put(child); - sk_acceptq_removed(sk); reqsk_put(req); + cond_resched(); } - if (queue->fastopenq) { + if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ - spin_lock_bh(&queue->fastopenq->lock); - acc_req = queue->fastopenq->rskq_rst_head; - queue->fastopenq->rskq_rst_head = NULL; - spin_unlock_bh(&queue->fastopenq->lock); - while ((req = acc_req) != NULL) { - acc_req = req->dl_next; + spin_lock_bh(&queue->fastopenq.lock); + req = queue->fastopenq.rskq_rst_head; + queue->fastopenq.rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq.lock); + while (req != NULL) { + next = req->dl_next; reqsk_put(req); + req = next; } } WARN_ON(sk->sk_ack_backlog); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c3b1f3a0f4cf..ab9f8a66615d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -730,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !listen_sock_qlen(lopt)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { @@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -879,7 +799,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -906,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 89120196a949..08643a3616af 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -126,7 +126,7 @@ void inet_put_port(struct sock *sk) } EXPORT_SYMBOL(inet_put_port); -int __inet_inherit_port(struct sock *sk, struct sock *child) +int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; unsigned short port = inet_sk(child)->inet_num; @@ -185,6 +185,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } @@ -398,14 +400,18 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +/* insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT + */ +int inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; + int ret = 0; - WARN_ON(!sk_unhashed(sk)); + WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -419,6 +425,12 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk) sk_nulls_del_node_init_rcu(osk); } spin_unlock(lock); + return ret; +} + +void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +{ + inet_ehash_insert(sk, osk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ae22cc24fbe8..c67f9bd7699c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -123,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* * Step 2: Hash TW into tcp ehash chain. * Notes : - * - tw_refcnt is set to 3 because : + * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. + * - We have one reference from timer. + * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. */ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + atomic_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -217,7 +219,7 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * @@ -245,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) */ tw->tw_kill = timeo <= 4*HZ; - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { - atomic_inc(&tw->tw_refcnt); + if (!rearm) { + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); + } else { + mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 241afd743d2c..86fa45809540 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -157,22 +157,6 @@ void __init inet_initpeers(void) INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } -static int addr_compare(const struct inetpeer_addr *a, - const struct inetpeer_addr *b) -{ - int i, n = (a->family == AF_INET ? 1 : 4); - - for (i = 0; i < n; i++) { - if (a->addr.a6[i] == b->addr.a6[i]) - continue; - if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) - return -1; - return 1; - } - - return 0; -} - #define rcu_deref_locked(X, BASE) \ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) @@ -188,7 +172,7 @@ static int addr_compare(const struct inetpeer_addr *a, *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ u != peer_avl_empty;) { \ - int cmp = addr_compare(_daddr, &u->daddr); \ + int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ if (cmp == -1) \ @@ -215,7 +199,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, int count = 0; while (u != peer_avl_empty) { - int cmp = addr_compare(daddr, &u->daddr); + int cmp = inetpeer_addr_cmp(daddr, &u->daddr); if (cmp == 0) { /* Before taking a reference, check if this entry was * deleted (refcnt=-1) diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 2d3aa408fbdc..da0d7ce85844 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -61,18 +61,18 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) } -static int ip_forward_finish(struct sock *sk, struct sk_buff *skb) +static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); skb_sender_cpu_clear(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } int ip_forward(struct sk_buff *skb) @@ -81,6 +81,7 @@ int ip_forward(struct sk_buff *skb) struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + struct net *net; /* that should never happen */ if (skb->pkt_type != PACKET_HOST) @@ -99,6 +100,7 @@ int ip_forward(struct sk_buff *skb) return NET_RX_SUCCESS; skb_forward_csum(skb); + net = dev_net(skb->dev); /* * According to the RFC, we must first decrease the TTL field. If @@ -119,7 +121,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { - IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto drop; @@ -143,8 +145,9 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, rt->dst.dev, ip_forward_finish); + return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, rt->dst.dev, + ip_forward_finish); sr_failed: /* @@ -155,7 +158,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 15762e758861..5482745d5d68 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -48,7 +48,7 @@ #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> -#include <net/vrf.h> +#include <net/l3mdev.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -78,7 +78,7 @@ struct ipq { u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* VRF device index */ + int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -151,7 +151,8 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->vif = arg->vif; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + NULL; } static void ip4_frag_free(struct inet_frag_queue *q) @@ -653,11 +654,10 @@ out_fail: } /* Process an incoming IP datagram fragment. */ -int ip_defrag(struct sk_buff *skb, u32 user) +int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; - int vif = vrf_master_ifindex_rcu(dev); - struct net *net = dev_net(dev); + int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); @@ -682,7 +682,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_defrag); -struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; @@ -711,7 +711,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - if (ip_defrag(skb, user)) + if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1bf328182697..bd0679d90519 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -400,25 +400,14 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) if (tunnel) { skb_pop_mac_header(skb); if (tunnel->collect_md) { - struct ip_tunnel_info *info; + __be16 flags; + __be64 tun_id; - tun_dst = metadata_dst_alloc(0, GFP_ATOMIC); + flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + tun_id = key_to_tunnel_id(tpi->key); + tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; - - info = &tun_dst->u.tun_info; - info->key.u.ipv4.src = iph->saddr; - info->key.u.ipv4.dst = iph->daddr; - info->key.tos = iph->tos; - info->key.ttl = iph->ttl; - - info->mode = IP_TUNNEL_INFO_RX; - info->key.tun_flags = tpi->flags & - (TUNNEL_CSUM | TUNNEL_KEY); - info->key.tun_id = key_to_tunnel_id(tpi->key); - - info->key.tp_src = 0; - info->key.tp_dst = 0; } ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); @@ -522,7 +511,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) int err; tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f4fc8a77aaa7..b1209b63381f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -157,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; @@ -167,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), dev_net(dev))) { + net_eq(sock_net(sk), net)) { if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) + if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { @@ -188,10 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb) +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); - __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); @@ -248,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb) /* * Reassemble IP fragments. */ + struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) + if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } - return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, + net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } @@ -311,7 +311,7 @@ drop: int sysctl_ip_early_demux __read_mostly = 1; EXPORT_SYMBOL(sysctl_ip_early_demux); -static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; @@ -337,8 +337,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); + NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER); goto drop; } } @@ -359,11 +358,9 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); @@ -378,6 +375,7 @@ drop: int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct iphdr *iph; + struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -387,11 +385,12 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); + net = dev_net(dev); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto out; } @@ -417,7 +416,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); - IP_ADD_STATS_BH(dev_net(dev), + IP_ADD_STATS_BH(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); @@ -431,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -441,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -453,14 +452,14 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, ip_rcv_finish); csum_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0138fada0951..67404e1fe7d4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,9 +83,10 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); -static int ip_fragment(struct sock *sk, struct sk_buff *skb, - unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)); +static int +ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct net *, struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) @@ -95,32 +96,28 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL, - skb_dst(skb)->dev, dst_output_sk); -} - -int __ip_local_out(struct sk_buff *skb) -{ - return __ip_local_out_sk(skb->sk, skb); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb_dst(skb)->dev, + dst_output); } -int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; - err = __ip_local_out(skb); + err = __ip_local_out(net, sk, skb); if (likely(err == 1)) - err = dst_output_sk(sk, skb); + err = dst_output(net, sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out_sk); +EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -135,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * Add an ip header to a skbuff and send it out. * */ -int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, +int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ @@ -149,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->dst)) - iph->frag_off = htons(IP_DF); - else - iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(sock_net(sk), skb, sk); + if (ip_dont_fragment(sk, &rt->dst)) { + iph->frag_off = htons(IP_DF); + iph->id = 0; + } else { + iph->frag_off = 0; + __ip_select_ident(net, iph, 1); + } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -168,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, skb->mark = sk->sk_mark; /* Send it out. */ - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; @@ -182,9 +182,9 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -220,8 +220,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, - unsigned int mtu) +static int ip_finish_output_gso(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -230,7 +230,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || skb_gso_network_seglen(skb) <= mtu) - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. * @@ -253,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, int err; segs->next = NULL; - err = ip_fragment(sk, segs, mtu, ip_finish_output2); + err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, return ret; } -static int ip_finish_output(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; @@ -271,20 +271,20 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb, mtu); + return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) - return ip_fragment(sk, skb, mtu, ip_finish_output2); + return ip_fragment(net, sk, skb, mtu, ip_finish_output2); - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); } -int ip_mc_output(struct sock *sk, struct sk_buff *skb) +int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -292,7 +292,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -320,7 +320,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, - sk, newskb, NULL, newskb->dev, + net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); } @@ -335,26 +335,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb, - NULL, newskb->dev, dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, newskb, NULL, newskb->dev, + dev_loopback_xmit); } - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL, - skb->dev, ip_finish_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb->dev, + ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sock *sk, struct sk_buff *skb) +int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -377,6 +379,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; @@ -407,7 +410,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, + rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -444,20 +447,20 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(sock_net(sk), skb, sk, + ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out(skb); + res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -486,29 +489,26 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -static int ip_fragment(struct sock *sk, struct sk_buff *skb, +static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)) + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); } /* @@ -518,8 +518,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, * single device frame, and queue such a frame for sending. */ -int ip_do_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -621,10 +621,10 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, ip_send_check(iph); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -634,7 +634,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, } if (err == 0) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } @@ -643,7 +643,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, kfree_skb(frag); frag = skb; } - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: @@ -761,19 +761,19 @@ slow_path: ip_send_check(iph); - err = output(sk, skb2); + err = output(net, sk, skb2); if (err) goto fail; - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } EXPORT_SYMBOL(ip_do_fragment); @@ -1434,7 +1434,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; - err = ip_local_out(skb); + err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); @@ -1561,7 +1561,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - if (!oif && netif_index_is_vrf(net, skb->skb_iif)) + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; flowi4_init_output(&fl4, oif, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 934f2ac8ad61..6cb9009c3d96 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,12 +46,14 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/dst_metadata.h> int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { - int pkt_len = skb->len; + int pkt_len = skb->len - skb_inner_network_offset(skb); + struct net *net = dev_net(rt->dst.dev); struct iphdr *iph; int err; @@ -75,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(dev_net(rt->dst.dev), iph, - skb_shinfo(skb)->gso_segs ?: 1); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out_sk(sk, skb); + err = ip_local_out(net, sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; @@ -119,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) } EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags) +{ + struct metadata_dst *res; + struct ip_tunnel_info *dst, *src; + + if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; + + res = metadata_dst_alloc(0, flags); + if (!res) + return NULL; + + dst = &res->u.tun_info; + src = &md->u.tun_info; + dst->key.tun_id = src->key.tun_id; + if (src->mode & IP_TUNNEL_INFO_IPV6) + memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, + sizeof(struct in6_addr)); + else + dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->mode = src->mode | IP_TUNNEL_INFO_TX; + + return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); + struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask) @@ -198,8 +226,6 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, - [LWTUNNEL_IP_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; @@ -239,17 +265,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - - if (tb[LWTUNNEL_IP_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); - if (tb[LWTUNNEL_IP_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options = NULL; tun_info->options_len = 0; *ts = new_state; @@ -267,8 +286,6 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -282,8 +299,6 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2) /* LWTUNNEL_IP_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } @@ -306,8 +321,6 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, - [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, }; @@ -347,17 +360,10 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); - - if (tb[LWTUNNEL_IP6_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); - if (tb[LWTUNNEL_IP6_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); - tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options = NULL; + tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = 0; *ts = new_state; @@ -375,8 +381,6 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -390,8 +394,6 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 0c152087ca15..4d8f0b698777 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; - err = dst_output(skb); + err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = skb->len; iptunnel_xmit_stats(err, &dev->stats, dev->tstats); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a2c0162c3ba..fc42525d8694 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .policy = ipmr_rule_policy, @@ -1679,17 +1678,18 @@ static void ip_encap(struct net *net, struct sk_buff *skb, nf_reset(skb); } -static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb) +static inline int ipmr_forward_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } /* @@ -1746,7 +1746,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * to blackhole. */ - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } @@ -1788,8 +1788,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, dev, + NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dev, ipmr_forward_finish); return; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 61eafc9b4545..c3776ff6749f 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,9 +17,8 @@ #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) +int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; @@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, } } -static int nf_ip_reroute(struct sk_buff *skb, +static int nf_ip_reroute(struct net *net, struct sk_buff *skb, const struct nf_queue_entry *entry) { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb, skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(skb, RTN_UNSPEC); + return ip_route_me_harder(net, skb, RTN_UNSPEC); } return 0; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c416cb355cb0..2dad3e1c5f11 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -247,10 +247,10 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) } unsigned int arpt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; @@ -285,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, */ e = get_entry(table_base, private->hook_entry[hook]); + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.hooknum = hook; @@ -367,13 +368,10 @@ static inline bool unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. - * - * Keeps track of largest call depth seen and stores it in newinfo->stacksize. */ -static int mark_source_chains(struct xt_table_info *newinfo, +static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { - unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -389,7 +387,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; - calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -444,8 +441,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; - if (calldepth > 0) - --calldepth; } else { int newpos = t->verdict; @@ -460,10 +455,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, return 0; } - if (entry0 + newpos != arpt_next_entry(e) && - ++calldepth > max_calldepth) - max_calldepth = calldepth; - /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -480,7 +471,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } - newinfo->stacksize = max_calldepth; return 1; } @@ -670,6 +660,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (ret != 0) break; ++i; + if (strcmp(arpt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -1442,6 +1435,9 @@ static int translate_compat_table(const char *name, break; } ++i; + if (strcmp(arpt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 93876d03120c..1897ee160920 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,10 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int -arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +arptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return arpt_do_table(skb, ops->hooknum, state, - net->ipv4.arptable_filter); + return arpt_do_table(skb, state, state->net->ipv4.arptable_filter); } static struct nf_hook_ops *arpfilter_ops __read_mostly; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 787f99ed55e2..42d0946956db 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -246,7 +246,8 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, return 0; } -static void trace_packet(const struct sk_buff *skb, +static void trace_packet(struct net *net, + const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, @@ -258,7 +259,6 @@ static void trace_packet(const struct sk_buff *skb, const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; - struct net *net = dev_net(in ? in : out); root = get_entry(private->entries, private->hook_entry[hook]); @@ -285,10 +285,10 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ipt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; /* Initializing verdict to NF_DROP keeps gcc happy. */ @@ -315,6 +315,7 @@ ipt_do_table(struct sk_buff *skb, acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.family = NFPROTO_IPV4; @@ -378,8 +379,8 @@ ipt_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, state->in, state->out, - table->name, private, e); + trace_packet(state->net, skb, hook, state->in, + state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { @@ -443,15 +444,11 @@ ipt_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - * there are loops. Puts hook bitmask in comefrom. - * - * Keeps track of largest call depth seen and stores it in newinfo->stacksize. - */ + there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { - unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -465,7 +462,6 @@ mark_source_chains(struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; - calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -527,9 +523,6 @@ mark_source_chains(struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; - WARN_ON_ONCE(calldepth == 0); - if (calldepth > 0) - --calldepth; } else { int newpos = t->verdict; @@ -543,14 +536,9 @@ mark_source_chains(struct xt_table_info *newinfo, newpos); return 0; } - if (entry0 + newpos != ipt_next_entry(e) && - !(e->ip.flags & IPT_F_GOTO) && - ++calldepth > max_calldepth) - max_calldepth = calldepth; - /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u, calldepth %d\n", - pos, newpos, calldepth); + duprintf("Jump rule %u -> %u\n", + pos, newpos); } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -564,7 +552,6 @@ mark_source_chains(struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } - newinfo->stacksize = max_calldepth; return 1; } @@ -844,6 +831,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; + if (strcmp(ipt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1759,6 +1749,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ipt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 45cb16a6a4a3..3f32c03e8b2e 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -507,14 +507,14 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(const struct nf_hook_ops *ops, +arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; - struct net *net = dev_net(state->in ? state->in : state->out); + struct net *net = state->net; /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 87907d4bd259..1d16c0f28df0 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(skb, hook); + nf_send_reset(par->net, skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 95ea633e8356..f1a8df8ecc1f 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) } static void -synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, +synproxy_send_tcp(const struct synproxy_net *snet, + const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { + struct net *net = nf_ct_net(snet->tmpl); + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; @@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { @@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, nf_conntrack_get(nfct); } - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; free_nskb: @@ -68,7 +71,8 @@ free_nskb: } static void -synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, +synproxy_send_client_synack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; @@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } @@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } @@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void @@ -226,7 +230,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } @@ -258,7 +262,7 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_net *snet = synproxy_pernet(par->net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -287,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(skb, th, &opts); + synproxy_send_client_synack(snet, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { @@ -299,11 +303,11 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, +static unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out)); + struct synproxy_net *snet = synproxy_pernet(nhs->net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8618fd150c96..74dd6671b66d 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -32,12 +32,11 @@ static __be32 rpfilter_get_saddr(__be32 addr) return addr; } -static bool rpfilter_lookup_reverse(struct flowi4 *fl4, +static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; bool dev_match; - struct net *net = dev_net(dev); int ret __maybe_unused; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) @@ -98,7 +97,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; - return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index a0f3beca52d2..397ef2dd133e 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,19 +33,16 @@ static const struct xt_table packet_filter = { }; static unsigned int -iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter); + return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 62cbb8c5f4a8..ba5d392a13c4 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -39,7 +39,6 @@ static const struct xt_table packet_mangler = { static unsigned int ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) { - struct net_device *out = state->out; unsigned int ret; const struct iphdr *iph; u_int8_t tos; @@ -59,8 +58,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state, - dev_net(out)->ipv4.iptable_mangle); + ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -69,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -80,18 +78,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* The work comes in here from netfilter.c. */ static unsigned int -iptable_mangle_hook(const struct nf_hook_ops *ops, +iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (ops->hooknum == NF_INET_LOCAL_OUT) + if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state); - if (ops->hooknum == NF_INET_POST_ROUTING) - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->out)->ipv4.iptable_mangle); + if (state->hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, state, + state->net->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->in)->ipv4.iptable_mangle); + return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 0d4d9cdf98a4..3a2e4d830a0b 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -28,42 +28,40 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table); + return ipt_do_table(skb, state, state->net->ipv4.nat_table); } -static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0356e6da4bb7..1ba02811acb0 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,19 +20,16 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw); + return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 4bce3980ccd9..f534e2f05bad 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,20 +37,16 @@ static const struct xt_table security_table = { }; static unsigned int -iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* Somebody is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, - net->ipv4.iptable_security); + return ipt_do_table(skb, state, state->net->ipv4.iptable_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8a2caaf3940b..752fb40adcf8 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv4_helper(const struct nf_hook_ops *ops, +static unsigned int ipv4_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -119,7 +119,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops, ct, ctinfo); } -static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, +static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -143,14 +143,14 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } -static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -158,7 +158,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } /* Connection tracking may drop packets, but never alters them, so diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index cdde3ec496e9..c567e1b5d799 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -30,7 +30,7 @@ static inline struct nf_icmp_net *icmp_pernet(struct net *net) } static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; @@ -144,7 +144,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), - PF_INET, &origtuple)) { + PF_INET, net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 9306ec4fab41..bf25f45b23d2 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,14 +22,15 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, + u_int32_t user) { int err; skb_orphan(skb); local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) { @@ -61,7 +62,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, return IP_DEFRAG_CONNTRACK_OUT + zone_id; } -static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -83,9 +84,9 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = - nf_ct_defrag_user(ops->hooknum, skb); + nf_ct_defrag_user(state->hook, skb); - if (nf_ct_ipv4_gather_frags(skb, user)) + if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index b5bb37564b0e..ceb187308120 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -13,6 +13,7 @@ #include <linux/percpu.h> #include <linux/route.h> #include <linux/skbuff.h> +#include <linux/netfilter.h> #include <net/checksum.h> #include <net/icmp.h> #include <net/ip.h> @@ -22,25 +23,10 @@ #include <net/netfilter/nf_conntrack.h> #endif -static struct net *pick_net(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_NS - const struct dst_entry *dst; - - if (skb->dev != NULL) - return dev_net(skb->dev); - dst = skb_dst(skb); - if (dst != NULL && dst->dev != NULL) - return dev_net(dst->dev); -#endif - return &init_net; -} - -static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, - int oif) +static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, + const struct in_addr *gw, int oif) { const struct iphdr *iph = ip_hdr(skb); - struct net *net = pick_net(skb); struct rtable *rt; struct flowi4 fl4; @@ -64,7 +50,7 @@ static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, return true; } -void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, +void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in_addr *gw, int oif) { struct iphdr *iph; @@ -104,9 +90,9 @@ void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, --iph->ttl; ip_send_check(iph); - if (nf_dup_ipv4_route(skb, gw, oif)) { + if (nf_dup_ipv4_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip_local_out(skb); + ip_local_out(net, skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 22f4579b0c2a..5075b7ecd26d 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -255,9 +255,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -266,7 +266,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); /* We never see fragments: conntrack defrags on pre-routing * and local-out, and nf_nat_out protects post-routing. @@ -295,7 +295,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) + state->hook)) return NF_DROP; else return NF_ACCEPT; @@ -308,21 +308,21 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, state, ct); + ret = do_chain(priv, skb, state, ct); if (ret != NF_ACCEPT) return ret; - if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) break; - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } @@ -332,11 +332,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -345,9 +345,9 @@ oif_changed: EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -355,7 +355,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); @@ -365,9 +365,9 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -384,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -396,7 +396,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } @@ -407,9 +407,9 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -424,14 +424,14 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -440,7 +440,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 3262e41ff76f..c747b2d9eb77 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); /* Send RST reply */ -void nf_send_reset(struct sk_buff *oldskb, int hook) +void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; const struct iphdr *oiph; @@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; /* "Never happens" */ @@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 8412268bbad1..9d09d4f59545 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -15,15 +15,15 @@ #include <net/netfilter/nf_tables.h> static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, +nft_do_chain_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, ops, skb, state); + nft_set_pktinfo(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } static struct nft_af_info nft_af_arp __read_mostly = { diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index aa180d3a69a5..ca9dc3c46c4f 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -18,18 +18,18 @@ #include <net/ip.h> #include <net/netfilter/nf_tables_ipv4.h> -static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, +static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, +static unsigned int nft_ipv4_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -41,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv4(ops, skb, state); + return nft_do_chain_ipv4(priv, skb, state); } struct nft_af_info nft_af_ipv4 __read_mostly = { diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index bf5c30ae14e4..f5c66a7a4bf2 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -26,44 +26,44 @@ #include <net/netfilter/nf_nat_l3proto.h> #include <net/ip.h> -static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv4 = { diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index e335b0afdaf3..2375b0a8be46 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -21,7 +21,7 @@ #include <net/route.h> #include <net/ip.h> -static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +static unsigned int nf_route_table_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -37,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); mark = skb->mark; iph = ip_hdr(skb); @@ -45,7 +45,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, daddr = iph->daddr; tos = iph->tos; - ret = nft_do_chain(&pkt, ops); + ret = nft_do_chain(&pkt, priv); if (ret != NF_DROP && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, iph->daddr != daddr || skb->mark != mark || iph->tos != tos) - if (ip_route_me_harder(skb, RTN_UNSPEC)) + if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) ret = NF_DROP; } return ret; diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index 25419fbddcb6..bf855e64fc45 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -26,11 +26,11 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); struct in_addr gw = { - .s_addr = regs->data[priv->sreg_addr], + .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; int oif = regs->data[priv->sreg_dev]; - nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif); + nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); } static int nft_dup_ipv4_init(const struct nft_ctx *ctx, diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 40e414c4ca56..b72ffc58e255 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -26,7 +26,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index d8d795df9c13..c09d4381427e 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -36,7 +36,7 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, mr.range[0].flags |= priv->flags; regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, - pkt->ops->hooknum); + pkt->hook); } static struct nft_expr_type nft_redir_ipv4_type; diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index b07e58b51158..c24f41c816b3 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,11 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->skb, pkt->ops->hooknum); + nf_send_reset(pkt->net, pkt->skb, pkt->hook); break; default: break; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 561cd4b8fc6e..8c0d0bdc2a7c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -411,8 +411,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); - err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, - NULL, rt->dst.dev, dst_output_sk); + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, rt->dst.dev, + dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -483,6 +484,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; @@ -542,7 +544,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); + err = ip_cmsg_send(net, msg, &ipc, false); if (err) goto out; if (ipc.opt) @@ -597,6 +599,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, &fl4); + if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -607,7 +612,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f3087aaa6dd8..85f184e429c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,7 +112,7 @@ #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> -#include <net/vrf.h> +#include <net/l3mdev.h> #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -838,6 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) struct inet_peer *peer; struct net *net; int log_martians; + int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); @@ -846,10 +847,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); + vif = l3mdev_master_ifindex_rcu(rt->dst.dev); rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); @@ -938,7 +940,8 @@ static int ip_error(struct sk_buff *skb) break; } - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, + l3mdev_master_ifindex(skb->dev), 1); send = true; if (peer) { @@ -1149,7 +1152,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) +static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1435,12 +1438,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, } static struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, bool nopolicy, bool noxfrm, bool will_cache) { - return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | - (nopolicy ? DST_NOPOLICY : 0) | - (noxfrm ? DST_NOXFRM : 0)); + struct rtable *rt; + + rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); + + if (rt) { + rt->rt_genid = rt_genid_ipv4(dev_net(dev)); + rt->rt_flags = flags; + rt->rt_type = type; + rt->rt_is_input = 0; + rt->rt_iif = 0; + rt->rt_pmtu = 0; + rt->rt_gateway = 0; + rt->rt_uses_gateway = 0; + rt->rt_table_id = 0; + INIT_LIST_HEAD(&rt->rt_uncached); + + rt->dst.output = ip_output; + if (flags & RTCF_LOCAL) + rt->dst.input = ip_local_deliver; + } + + return rt; } /* called in rcu_read_lock() section */ @@ -1449,6 +1474,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct rtable *rth; struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; u32 itag = 0; int err; @@ -1461,9 +1487,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(saddr)) - goto e_inval; + if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) + goto e_inval; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) @@ -1474,7 +1499,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err < 0) goto e_err; } - rth = rt_dst_alloc(dev_net(dev)->loopback_dev, + if (our) + flags |= RTCF_LOCAL; + + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1483,20 +1511,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev)); - rth->rt_flags = RTCF_MULTICAST; - rth->rt_type = RTN_MULTICAST; rth->rt_is_input= 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); - if (our) { - rth->dst.input= ip_local_deliver; - rth->rt_flags |= RTCF_LOCAL; - } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) @@ -1605,7 +1620,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev->dev, + rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { @@ -1613,19 +1628,12 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); - rth->rt_flags = 0; - rth->rt_type = res->type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; - rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); if (lwtunnel_output_redirect(rth->dst.lwtstate)) { @@ -1643,6 +1651,48 @@ out: return err; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* To make ICMP packets follow the right flow, the multipath hash is + * calculated from the inner IP addresses in reverse order. + */ +static int ip_multipath_icmp_hash(struct sk_buff *skb) +{ + const struct iphdr *outer_iph = ip_hdr(skb); + struct icmphdr _icmph; + const struct icmphdr *icmph; + struct iphdr _inner_iph; + const struct iphdr *inner_iph; + + if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) + goto standard_hash; + + icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), + &_icmph); + if (!icmph) + goto standard_hash; + + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_REDIRECT && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB) { + goto standard_hash; + } + + inner_iph = skb_header_pointer(skb, + outer_iph->ihl * 4 + sizeof(_icmph), + sizeof(_inner_iph), &_inner_iph); + if (!inner_iph) + goto standard_hash; + + return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); + +standard_hash: + return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); +} + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, const struct flowi4 *fl4, @@ -1650,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(res); + if (res->fi && res->fi->fib_nhs > 1) { + int h; + + if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP)) + h = ip_multipath_icmp_hash(skb); + else + h = fib_multipath_hash(saddr, daddr); + fib_select_multipath(res, h); + } #endif /* create a routing cache entry */ @@ -1693,7 +1750,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, */ tun_info = skb_tunnel_info(skb); - if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; @@ -1703,6 +1760,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_source; res.fi = NULL; + res.table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1730,10 +1788,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex; + fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res, 0); @@ -1750,7 +1809,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; goto local_input; } @@ -1772,7 +1831,7 @@ brd_input: err = fib_validate_source(skb, saddr, 0, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; @@ -1792,26 +1851,18 @@ local_input: } } - rth = rt_dst_alloc(net->loopback_dev, + rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; - rth->dst.input= ip_local_deliver; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - - rth->rt_genid = rt_genid_ipv4(net); - rth->rt_flags = flags|RTCF_LOCAL; - rth->rt_type = res.type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res.table) + rth->rt_table_id = res.table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { @@ -1833,6 +1884,7 @@ no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; res.fi = NULL; + res.table = NULL; goto local_input; /* @@ -1855,8 +1907,6 @@ e_nobufs: goto out; martian_source: - err = -EINVAL; -martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } @@ -1984,28 +2034,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } add: - rth = rt_dst_alloc(dev_out, + rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), do_cache); if (!rth) return ERR_PTR(-ENOBUFS); - rth->dst.output = ip_output; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); - rth->rt_flags = flags; - rth->rt_type = type; - rth->rt_is_input = 0; rth->rt_iif = orig_oif ? : 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; + RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) - rth->dst.input = ip_local_deliver; if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { @@ -2034,7 +2075,8 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + int mp_hash) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2042,6 +2084,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) struct fib_result res; struct rtable *rth; int orig_oif; + int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; @@ -2132,11 +2175,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - if (netif_is_vrf(dev_out) && - !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) { - rth = vrf_dev_get_rth(dev_out); + + rth = l3mdev_get_rtable(dev_out, fl4); + if (rth) goto out; - } } if (!fl4->daddr) { @@ -2150,10 +2192,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res, 0)) { + err = fib_lookup(net, fl4, &res, 0); + if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif) { + if (fl4->flowi4_oif && + !netif_index_is_l3_master(net, fl4->flowi4_oif)) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2178,7 +2222,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) res.type = RTN_UNICAST; goto make_route; } - rth = ERR_PTR(-ENETUNREACH); + rth = ERR_PTR(err); goto out; } @@ -2195,18 +2239,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res); - else -#endif - if (!res.prefixlen && - res.table->tb_num_default > 1 && - res.type == RTN_UNICAST && !fl4->flowi4_oif) - fib_select_default(fl4, &res); - - if (!fl4->saddr) - fl4->saddr = FIB_RES_PREFSRC(net, res); + fib_select_path(net, &res, fl4, mp_hash); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; @@ -2219,7 +2252,7 @@ out: rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { @@ -2271,7 +2304,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + new->output = dst_discard_out; new->dev = ort->dst.dev; if (new->dev) @@ -2297,7 +2330,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, - struct sock *sk) + const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); @@ -2313,7 +2346,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, __be32 dst, __be32 src, +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, int event, int nowait, unsigned int flags) { @@ -2333,8 +2366,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = fl4->flowi4_tos; - r->rtm_table = RT_TABLE_MAIN; - if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + r->rtm_table = table_id; + if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; @@ -2439,6 +2472,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int err; int mark; struct sk_buff *skb; + u32 table_id = RT_TABLE_MAIN; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2474,6 +2508,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + if (netif_index_is_l3_master(net, fl4.flowi4_oif)) + fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; + if (iif) { struct net_device *dev; @@ -2508,7 +2545,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, dst, src, &fl4, skb, + if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) + table_id = rt->rt_table_id; + + err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err < 0) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d70b1f603692..4c0892badb8b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, - __u16 *mssp) +__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - tcp_synq_overflow(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return __cookie_v4_init_sequence(iph, th, mssp); } @@ -229,6 +225,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); if (child) { atomic_set(&req->rsk_refcnt, 1); + sock_rps_save_rxhash(child, skb); inet_csk_reqsk_queue_add(sk, req, child); } else { reqsk_free(req); @@ -288,6 +285,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; @@ -326,7 +327,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ if (!req) goto out; @@ -345,7 +346,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->snt_synack.v64 = 0; treq->tfo_listener = false; ireq->ir_iif = sk->sk_bound_dev_if; @@ -381,10 +382,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 879bdc5c95b1..30a531ccbf77 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -818,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "icmp_redirects_use_orig_daddr", + .data = &init_net.ipv4.sysctl_icmp_redirects_use_orig_daddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), @@ -929,6 +936,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "igmp_link_local_mcast_reports", + .data = &sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b8b8fa184f75..ac1bdbb50352 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -900,7 +900,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto out_err; } @@ -967,7 +968,8 @@ new_segment: copied += copy; offset += copy; - if (!(size -= copy)) { + size -= copy; + if (!size) { tcp_tx_timestamp(sk, skb); goto out; } @@ -988,7 +990,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1111,7 +1114,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto do_error; } @@ -1267,7 +1271,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1767,7 +1772,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* __ Restore normal policy in scheduler __ */ - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; @@ -1778,7 +1784,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, do_prequeue: tcp_prequeue_process(sk); - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; @@ -2230,7 +2237,8 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; - if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq += tp->max_window + 2; + if (tp->write_seq == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; @@ -2253,13 +2261,6 @@ int tcp_disconnect(struct sock *sk, int flags) } EXPORT_SYMBOL(tcp_disconnect); -void tcp_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); -} - static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && @@ -2581,7 +2582,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, TCPF_LISTEN))) { tcp_fastopen_init_key_once(true); - err = fastopen_init_queue(sk, val); + fastopen_queue_tune(sk, val); } else { err = -EINVAL; } @@ -2849,10 +2850,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - if (icsk->icsk_accept_queue.fastopenq) - val = icsk->icsk_accept_queue.fastopenq->max_qlen; - else - val = 0; + val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; case TCP_TIMESTAMP: diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a2ed23c595cf..882caa4e72bc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; - u32 key; + u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = __tcp_ca_find_autoload(name); - key = ca ? ca->key : TCP_CA_UNSPEC; + if (ca) { + key = ca->key; + *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; + } rcu_read_unlock(); return key; @@ -170,6 +173,10 @@ out: */ if (ca->get_info) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) @@ -178,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } static void tcp_reinit_congestion_control(struct sock *sk, @@ -189,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); + if (sk->sk_state != TCP_CLOSE) + tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 28011fb1f4a2..448c2615fece 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -151,6 +151,27 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } +static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_TX_START) { + struct bictcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; + + /* We were application limited (idle) for a while. + * Shift epoch_start to keep cwnd growth to cubic curve. + */ + if (ca->epoch_start && delta > 0) { + ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } + return; + } +} + /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. * Avg err ~= 0.195% @@ -450,6 +471,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, + .cwnd_event = bictcp_cwnd_event, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f9c0fb84e435..93396bf7b475 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -124,10 +124,10 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } -static bool tcp_fastopen_create_child(struct sock *sk, - struct sk_buff *skb, - struct dst_entry *dst, - struct request_sock *req) +static struct sock *tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; @@ -140,11 +140,11 @@ static bool tcp_fastopen_create_child(struct sock *sk, child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (!child) - return false; + return NULL; - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); + spin_lock(&queue->fastopenq.lock); + queue->fastopenq.qlen++; + spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created @@ -161,15 +161,13 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); + atomic_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); @@ -178,12 +176,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * Note that IPv6 might also have used skb_get() trick - * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts) - * So we need to eventually get a clone of the packet, - * before inserting it in sk_receive_queue. + /* Queue the data carried in the SYN packet. + * We used to play tricky games with skb_get(). + * With lockless listener, it is a dead end. + * Do not think about it. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) but no need to queue the skb since @@ -191,12 +187,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, */ end_seq = TCP_SKB_CB(skb)->end_seq; if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2; - - if (unlikely(skb_shared(skb))) - skb2 = skb_clone(skb, GFP_ATOMIC); - else - skb2 = skb_get(skb); + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (likely(skb2)) { skb_dst_drop(skb2); @@ -214,11 +205,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, } } tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; - sk->sk_data_ready(sk); - bh_unlock_sock(child); - sock_put(child); - WARN_ON(!req->sk); - return true; + /* tcp_conn_request() is sending the SYNACK, + * and queues the child into listener accept queue. + */ + return child; } static bool tcp_fastopen_queue_check(struct sock *sk) @@ -235,8 +225,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - if (!fastopenq || fastopenq->max_qlen == 0) + fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { @@ -261,13 +251,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ -bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst) +struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -276,7 +267,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; - return false; + return NULL; } if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) @@ -296,11 +287,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, * data in SYN_RECV state. */ fastopen: - if (tcp_fastopen_create_child(sk, skb, dst, req)) { + child = tcp_fastopen_create_child(sk, skb, dst, req); + if (child) { foc->len = -1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - return true; + return child; } NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ @@ -308,6 +300,5 @@ fastopen: valid_foc.exp = foc->exp; *foc = valid_foc; - return false; + return NULL; } -EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc08e2352665..3b35c3f4d268 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2953,21 +2953,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) +void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - long seq_rtt_us = -1L; + long rtt_us = -1L; - if (synack_stamp && !tp->total_retrans) - seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); + if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { + struct skb_mstamp now; - /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets - * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() - */ - if (!tp->srtt_us) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); + skb_mstamp_get(&now); + rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); + } + + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L); } + static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -5472,7 +5472,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5698,15 +5698,14 @@ reset_and_undo: * address independent. */ -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; - u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5750,7 +5749,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5785,15 +5784,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (!acceptable) return 1; + if (!tp->srtt_us) + tcp_synack_rtt_meas(sk, req); + /* Once we leave TCP_SYN_RECV, we no longer need req * so release it. */ if (req) { - synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { - synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); @@ -5816,7 +5816,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, synack_stamp); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -6003,14 +6002,17 @@ static void tcp_ecn_create_request(struct request_sock *req, const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; + u32 ecn_ok_dst; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); + ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); + ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) + if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + (ecn_ok_dst & DST_FEATURE_ECN_CA)) inet_rsk(req)->ecn_ok = 1; } @@ -6020,11 +6022,11 @@ static void tcp_openreq_init(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_time_stamp; + skb_mstamp_get(&tcp_rsk(req)->snt_synack); tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; @@ -6040,9 +6042,11 @@ static void tcp_openreq_init(struct request_sock *req, } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener) + struct sock *sk_listener, + bool attach_listener) { - struct request_sock *req = reqsk_alloc(ops, sk_listener); + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); if (req) { struct inet_request_sock *ireq = inet_rsk(req); @@ -6062,13 +6066,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(struct sock *sk, +static bool tcp_syn_flood_action(const struct sock *sk, const struct sk_buff *skb, const char *proto) { + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; - struct listen_sock *lopt; #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { @@ -6079,12 +6083,12 @@ static bool tcp_syn_flood_action(struct sock *sk, #endif NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { - lopt->synflood_warned = 1; + if (!queue->synflood_warned && + sysctl_tcp_syncookies != 2 && + xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); - } + return want_cookie; } @@ -6109,16 +6113,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_fastopen_cookie foc = { .len = -1 }; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; - struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); + struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; - bool want_cookie = false, fastopen; + struct request_sock *req; + bool want_cookie = false; struct flowi fl; - struct tcp_fastopen_cookie foc = { .len = -1 }; - int err; - /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is @@ -6142,7 +6145,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops, sk); + req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie); if (!req) goto drop; @@ -6225,20 +6228,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - + if (!want_cookie) { + tcp_reqsk_record_syn(sk, req, skb); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); + } + if (fastopen_sk) { + af_ops->send_synack(fastopen_sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, false); + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, fastopen_sk); + sk->sk_data_ready(sk); + bh_unlock_sock(fastopen_sk); + sock_put(fastopen_sk); + } else { tcp_rsk(req)->tfo_listener = false; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (!want_cookie) + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, !want_cookie); + if (want_cookie) + goto drop_and_free; } - tcp_reqsk_record_syn(sk, req, skb); - + reqsk_put(req); return 0; drop_and_release: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 93898e093d4e..ddb198392c7f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -576,7 +576,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -795,7 +795,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV @@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, 0, @@ -818,11 +818,12 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,7 +834,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -865,7 +866,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family) { @@ -877,7 +878,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, sock_owned_by_user(sk) || - lockdep_is_held(&sk->sk_lock.slock)); + lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) @@ -894,7 +895,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, } EXPORT_SYMBOL(tcp_md5_do_lookup); -struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; @@ -1112,10 +1113,13 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +#endif + /* Called with rcu_read_lock() */ -static bool tcp_v4_inbound_md5_hash(struct sock *sk, +static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG /* * This gets called for each TCP segment that arrives * so we want to be efficient. @@ -1165,10 +1169,12 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, return true; } return false; -} #endif + return false; +} -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, +static void tcp_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); @@ -1179,7 +1185,8 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, ireq->opt = tcp_v4_save_options(skb); } -static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { @@ -1218,7 +1225,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, - .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1241,7 +1247,7 @@ EXPORT_SYMBOL(tcp_v4_conn_request); * The three way handshake has completed - we got a valid synack - * now create the new socket. */ -struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { @@ -1277,7 +1283,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; - sk_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; @@ -1338,34 +1343,11 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - - nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb); #endif @@ -1373,7 +1355,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1404,13 +1386,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_cookie_check(sk, skb); + if (!nsk) goto discard; - if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1420,7 +1402,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { + if (tcp_rcv_state_process(sk, skb)) { rsk = sk; goto reset; } @@ -1598,6 +1580,29 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1606,25 +1611,23 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif nf_reset(skb); if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v4_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1639,6 +1642,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret; @@ -1833,35 +1837,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_OPENREQ) { - struct request_sock *req = cur; - - icsk = inet_csk(st->syn_wait_sk); - req = req->dl_next; - while (1) { - while (req) { - if (req->rsk_ops->family == st->family) { - cur = req; - goto out; - } - req = req->dl_next; - } - if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) - break; -get_req: - req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; - } - sk = sk_nulls_next(st->syn_wait_sk); - st->state = TCP_SEQ_STATE_LISTENING; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } else { - icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) - goto start_req; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - sk = sk_nulls_next(sk); - } + sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) @@ -1871,16 +1847,6 @@ get_sk: goto out; } icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) { -start_req: - st->uid = sock_i_uid(sk); - st->syn_wait_sk = sk; - st->state = TCP_SEQ_STATE_OPENREQ; - st->sbucket = 0; - goto get_req; - } - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2012,7 +1978,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) void *rc = NULL; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: if (st->bucket >= INET_LHTABLE_SIZE) break; @@ -2071,7 +2036,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { @@ -2096,11 +2060,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) struct tcp_iter_state *st = seq->private; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: - if (v) { - struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); @@ -2154,7 +2113,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid) + struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; @@ -2171,7 +2130,8 @@ static void get_openreq4(const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, - from_kuid_munged(seq_user_ns(f), uid), + from_kuid_munged(seq_user_ns(f), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2185,7 +2145,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); @@ -2272,18 +2232,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num); - else - get_tcp4_sock(v, seq, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: - get_openreq4(v, seq, st->num, st->uid); - break; - } + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait4_sock(v, seq, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + get_openreq4(v, seq, st->num); + else + get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b3d64f61d922..c8cbc2b4b792 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -81,11 +81,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - if (a->family != b->family) - return false; - if (a->family == AF_INET) - return a->addr.a4 == b->addr.a4; - return ipv6_addr_equal(&a->addr.in6, &b->addr.in6); + return inetpeer_addr_cmp(a, b) == 0; } struct tcpm_hash_bucket { @@ -247,14 +243,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: - saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; - daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr); + inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr); + hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr; - daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr; + inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr); + inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr); hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -285,25 +281,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock struct net *net; if (tw->tw_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); + hash = ipv4_addr_hash(tw->tw_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (tw->tw_family == AF_INET6) { if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); + hash = ipv4_addr_hash(tw->tw_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = tw->tw_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = tw->tw_v6_daddr; + inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr); hash = ipv6_addr_hash(&tw->tw_v6_daddr); } } @@ -335,25 +325,19 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct net *net; if (sk->sk_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = sk->sk_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = sk->sk_v6_daddr; + inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr); hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } @@ -796,18 +780,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_daddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, - tm->tcpm_saddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, - &tm->tcpm_daddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, - &tm->tcpm_saddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; default: @@ -956,20 +940,21 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, a = info->attrs[v4]; if (a) { - addr->family = AF_INET; - addr->addr.a4 = nla_get_in_addr(a); + inetpeer_set_addr_v4(addr, nla_get_in_addr(a)); if (hash) - *hash = (__force unsigned int) addr->addr.a4; + *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr)); return 0; } a = info->attrs[v6]; if (a) { + struct in6_addr in6; + if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; - addr->family = AF_INET6; - addr->addr.in6 = nla_get_in6_addr(a); + in6 = nla_get_in6_addr(a); + inetpeer_set_addr_v6(addr, &in6); if (hash) - *hash = ipv6_addr_hash(&addr->addr.in6); + *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr)); return 0; } return optional ? 1 : -EAFNOSUPPORT; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6d8795b066ac..41828bdc5d32 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -162,9 +162,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, tw->tw_timeout); + inet_twsk_reschedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -201,7 +201,7 @@ kill: return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -251,7 +251,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -322,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } while (0); #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; @@ -338,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -362,30 +361,38 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +/* Warning : This function is called without sk_listener being locked. + * Be sure to read socket fields once, as their value could change under us. + */ void tcp_openreq_init_rwin(struct request_sock *req, - struct sock *sk, struct dst_entry *dst) + const struct sock *sk_listener, + const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - __u8 rcv_wscale; + const struct tcp_sock *tp = tcp_sk(sk_listener); + u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); + int full_space = tcp_full_space(sk_listener); int mss = dst_metric_advmss(dst); + u32 window_clamp; + __u8 rcv_wscale; - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + if (user_mss && user_mss < mss) + mss = user_mss; + window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); + if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); @@ -434,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) +struct sock *tcp_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); @@ -470,7 +479,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; - newtp->lsndtime = treq->snt_synack; + newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; @@ -502,9 +512,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (sysctl_tcp_fack) tcp_enable_fack(newtp); } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -568,8 +578,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); - tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(skb, &tmp_opt, 0, NULL); @@ -699,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); @@ -760,6 +768,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + sock_rps_save_rxhash(child, skb); + tcp_synack_rtt_meas(child, req); inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); /* Warning: caller must not call reqsk_put(req); @@ -812,8 +822,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), - skb->len); + ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1188e4fcf23b..6e79fcb0addb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -164,6 +164,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + tp->lsndtime = now; /* If it is a reply for ato after last received @@ -354,14 +357,10 @@ static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) } static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, - struct sock *sk) +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) { + if (inet_rsk(req)->ecn_ok) th->ece = 1; - if (tcp_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); - } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -609,12 +608,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct sock *sk, - struct request_sock *req, - unsigned int mss, struct sk_buff *skb, - struct tcp_out_options *opts, - const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) +static unsigned int tcp_synack_options(struct request_sock *req, + unsigned int mss, struct sk_buff *skb, + struct tcp_out_options *opts, + const struct tcp_md5sig_key *md5, + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -940,9 +938,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. We can be called from tcp_tsq_handler() * which holds one reference to sk_wmem_alloc. @@ -1827,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Ok, it looks like it is advisable to defer. */ - if (cong_win < send_win && cong_win < skb->len) + if (cong_win < send_win && cong_win <= skb->len) *is_cwnd_limited = true; return true; @@ -2060,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { - is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -2142,6 +2136,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; } @@ -2165,7 +2160,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (sk->sk_state == TCP_SYN_RECV) + if (tp->fastopen_rsk) return false; /* TLP is only scheduled when next timer event is RTO. */ @@ -2175,7 +2170,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -2184,9 +2179,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account - * for delayed ack when there's one outstanding packet. + * for delayed ack when there's one outstanding packet. If no RTT + * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1; + timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; if (tp->packets_out == 1) timeout = max_t(u32, timeout, (rtt + (rtt >> 1) + TCP_DELACK_MAX)); @@ -2897,6 +2893,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); + skb_mstamp_get(&skb->skb_mstamp); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -2948,20 +2945,22 @@ int tcp_send_synack(struct sock *sk) * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. */ -struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, +struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { - struct tcp_out_options opts; struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th; - struct sk_buff *skb; + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *md5 = NULL; + struct tcp_out_options opts; + struct sk_buff *skb; int tcp_header_size; + struct tcphdr *th; + u16 user_mss; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2969,11 +2968,23 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); + if (attach_req) { + skb->destructor = sock_edemux; + sock_hold(req_to_sk(req)); + skb->sk = req_to_sk(req); + } else { + /* sk is a const pointer, because we want to express multiple + * cpu might call us concurrently. + * sk->sk_wmem_alloc in an atomic, we can promote to rw. + */ + skb_set_owner_w(skb, (struct sock *)sk); + } skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && user_mss < mss) + mss = user_mss; memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES @@ -2987,8 +2998,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, rcu_read_lock(); md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif - tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2997,7 +3009,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th, sk); + tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -3011,8 +3023,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts); + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); + tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); @@ -3499,13 +3511,14 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } -int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + tcp_rsk(req)->txhash = net_tx_rndhash(); + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7149ebc820c7..c9c716a483e4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) } /* Calculate maximal number or retries on an orphaned socket. */ -static int tcp_orphan_retries(struct sock *sk, int alive) +static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = sysctl_tcp_orphan_retries; /* May be zero. */ @@ -184,7 +184,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -298,7 +298,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0a15e7f359f..24ec14f9825c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; return score; } @@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } @@ -1017,29 +1021,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - /* unconnected socket. If output device is enslaved to a VRF - * device lookup source address from VRF table. This mimics - * behavior of ip_route_connect{_init}. - */ - if (netif_index_is_vrf(net, ipc.oif)) { - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, - RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC), - faddr, saddr, dport, - inet->inet_sport); - - rt = ip_route_output_flow(net, fl4, sk); - if (!IS_ERR(rt)) { - saddr = fl4->saddr; - ip_rt_put(rt); - } - } - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, fl4); + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 933ea903f7b8..aba428626b52 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -4,9 +4,10 @@ #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> +#include <net/dst_metadata.h> +#include <net/net_namespace.h> #include <net/udp.h> #include <net/udp_tunnel.h> -#include <net/net_namespace.h> int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) @@ -103,4 +104,26 @@ void udp_tunnel_sock_release(struct socket *sock) } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); +struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, + __be16 flags, __be64 tunnel_id, int md_size) +{ + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + if (family == AF_INET) + tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); + else + tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + if (udp_hdr(skb)->check) + info->key.tun_flags |= TUNNEL_CSUM; + return tun_dst; +} +EXPORT_SYMBOL_GPL(udp_tun_rx_dst); + MODULE_LICENSE("GPL"); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 60b032f58ccc..62e1e72db461 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -22,7 +22,8 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } -static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb) +static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); @@ -52,8 +53,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) iph->tot_len = htons(skb->len); ip_send_check(iph); - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 2878dbfffeb7..9f298d0dc9a1 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -80,24 +80,25 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } -static int __xfrm4_output(struct sock *sk, struct sk_buff *skb) +static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; #ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif return x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm4_output(struct sock *sk, struct sk_buff *skb) +int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, skb_dst(skb)->dev, __xfrm4_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb_dst(skb)->dev, + __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bb919b28619f..f2606b9056bb 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,7 +15,7 @@ #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/vrf.h> +#include <net/l3mdev.h> static struct xfrm_policy_afinfo xfrm4_policy_afinfo; @@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; @@ -95,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + xdst->u.rt.rt_table_id = rt->rt_table_id; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; @@ -108,10 +111,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) { - oif = vrf_master_ifindex(skb_dst(skb)->dev) ? - : skb_dst(skb)->dev->ifindex; - } + if (skb_dst(skb)) + oif = l3mdev_fib_oif(skb_dst(skb)->dev); memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0f08d3b9e238..c8380f1876f1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1943,37 +1943,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) __ipv6_dev_ac_dec(ifp->idev, &addr); } -static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) -{ - if (dev->addr_len != ETH_ALEN) - return -1; - memcpy(eui, dev->dev_addr, 3); - memcpy(eui + 5, dev->dev_addr + 3, 3); - - /* - * The zSeries OSA network cards can be shared among various - * OS instances, but the OSA cards have only one MAC address. - * This leads to duplicate address conflicts in conjunction - * with IPv6 if more than one instance uses the same card. - * - * The driver for these cards can deliver a unique 16-bit - * identifier for each instance sharing the same card. It is - * placed instead of 0xFFFE in the interface identifier. The - * "u" bit of the interface identifier is not inverted in this - * case. Hence the resulting interface identifier has local - * scope according to RFC2373. - */ - if (dev->dev_id) { - eui[3] = (dev->dev_id >> 8) & 0xFF; - eui[4] = dev->dev_id & 0xFF; - } else { - eui[3] = 0xFF; - eui[4] = 0xFE; - eui[0] ^= 2; - } - return 0; -} - static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) { if (dev->addr_len != IEEE802154_ADDR_LEN) @@ -3656,7 +3625,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any, NULL); + ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any, NULL); out: in6_ifa_put(ifp); rtnl_unlock(); @@ -4726,18 +4695,24 @@ static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, } static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, - int items, int bytes, size_t syncpoff) + int bytes, size_t syncpoff) { - int i; - int pad = bytes - sizeof(u64) * items; + int i, c; + u64 buff[IPSTATS_MIB_MAX]; + int pad = bytes - sizeof(u64) * IPSTATS_MIB_MAX; + BUG_ON(pad < 0); - /* Use put_unaligned() because stats may not be aligned for u64. */ - put_unaligned(items, &stats[0]); - for (i = 1; i < items; i++) - put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); + memset(buff, 0, sizeof(buff)); + buff[0] = IPSTATS_MIB_MAX; - memset(&stats[items], 0, pad); + for_each_possible_cpu(c) { + for (i = 1; i < IPSTATS_MIB_MAX; i++) + buff[i] += snmp_get_cpu_field64(mib, c, i, syncpoff); + } + + memcpy(stats, buff, IPSTATS_MIB_MAX * sizeof(u64)); + memset(&stats[IPSTATS_MIB_MAX], 0, pad); } static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, @@ -4745,8 +4720,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, { switch (attrtype) { case IFLA_INET6_STATS: - __snmp6_fill_stats64(stats, idev->stats.ipv6, - IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); + __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes, + offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); @@ -4754,7 +4729,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, } } -static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) +static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, + u32 ext_filter_mask) { struct nlattr *nla; struct ifla_cacheinfo ci; @@ -4774,6 +4750,9 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) /* XXX - MC not implemented */ + if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS) + return 0; + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); if (!nla) goto nla_put_failure; @@ -4809,14 +4788,15 @@ static size_t inet6_get_link_af_size(const struct net_device *dev) return inet6_ifla6_size(); } -static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev, + u32 ext_filter_mask) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev) return -ENODATA; - if (inet6_fill_ifla6_attrs(skb, idev) < 0) + if (inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0) return -EMSGSIZE; return 0; @@ -4971,7 +4951,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, if (!protoinfo) goto nla_put_failure; - if (inet6_fill_ifla6_attrs(skb, idev) < 0) + if (inet6_fill_ifla6_attrs(skb, idev, 0) < 0) goto nla_put_failure; nla_nest_end(skb, protoinfo); @@ -5152,13 +5132,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); - if (rt && ip6_del_rt(rt)) - dst_free(&rt->dst); + if (rt) + ip6_del_rt(rt); } dst_hold(&ifp->rt->dst); - if (ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->dst); + ip6_del_rt(ifp->rt); rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9aadd57808a5..d70b0238f468 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -263,7 +263,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c index 447a7fbd1bb6..f5e2ba1c18bf 100644 --- a/net/ipv6/exthdrs_offload.c +++ b/net/ipv6/exthdrs_offload.c @@ -36,6 +36,6 @@ out: return ret; out_rt: - inet_del_offload(&rthdr_offload, IPPROTO_ROUTING); + inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING); goto out; } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2367a16eae58..9f777ec59a59 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -258,11 +258,6 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) -{ - return 0x3FFF; -} - static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ @@ -279,7 +274,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, - .default_pref = fib6_rule_default_pref, .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index 678d2df4b8d9..1a6852e1ac69 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -91,7 +91,7 @@ static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) *(__be64 *)&ip6h->daddr = p->locator; } -static int ila_output(struct sock *sk, struct sk_buff *skb) +static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -100,7 +100,7 @@ static int ila_output(struct sock *sk, struct sk_buff *skb) update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - return dst->lwtstate->orig_output(sk, skb); + return dst->lwtstate->orig_output(net, sk, skb); drop: kfree_skb(skb); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 6927f3fb5597..5d1c7cee2cb2 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -65,17 +65,18 @@ int inet6_csk_bind_conflict(const struct sock *sk, } EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); -struct dst_entry *inet6_csk_route_req(struct sock *sk, +struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, - const struct request_sock *req) + const struct request_sock *req, + u8 proto) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); - fl6->flowi6_proto = IPPROTO_TCP; + fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; final_p = fl6_update_dst(fl6, np->opt, &final); fl6->saddr = ireq->ir_v6_loc_addr; @@ -91,73 +92,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, return dst; } - -/* - * request_sock (formerly open request) hash tables. - */ -static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - u32 c; - - c = jhash_3words((__force u32)raddr->s6_addr32[0], - (__force u32)raddr->s6_addr32[1], - (__force u32)raddr->s6_addr32[2], - rnd); - - c = jhash_2words((__force u32)raddr->s6_addr32[3], - (__force u32)rport, - c); - - return c & (synq_hsize - 1); -} - -struct request_sock *inet6_csk_search_req(struct sock *sk, - const __be16 rport, - const struct in6_addr *raddr, - const struct in6_addr *laddr, - const int iif) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) && - ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) && - (!ireq->ir_iif || ireq->ir_iif == iif)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk != NULL); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet6_csk_search_req); - -void inet6_csk_reqsk_queue_hash_add(struct sock *sk, - struct request_sock *req, - const unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); -} -EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); +EXPORT_SYMBOL(inet6_csk_route_req); void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 6ac8dad0138a..21ace5a2bf7c 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -114,6 +114,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 418d9823692b..7d2e0023c72d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -155,6 +155,11 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_rcu_free(struct rt6_info *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -169,7 +174,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - dst_free(&pcpu_rt->dst); + rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } } @@ -181,7 +186,7 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); - dst_free(&rt->dst); + rt6_rcu_free(rt); } } @@ -846,7 +851,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -872,7 +877,7 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; @@ -933,6 +938,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); + if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && + !atomic_read(&rt->dst.__refcnt))) + return -EINVAL; + if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1025,6 +1034,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & RTF_CACHE)) fib6_prune_clones(info->nl_net, pn); + rt->dst.flags &= ~DST_NOCACHE; } out: @@ -1049,7 +1059,8 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); } return err; @@ -1060,7 +1071,8 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); return err; #endif } @@ -1410,7 +1422,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); - inet6_rt_notify(RTM_DELROUTE, rt, info); + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 34f121812a14..3c7b9310b33f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -361,6 +361,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, t); + ip6_tnl_dst_reset(t); dev_put(dev); } @@ -403,13 +404,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); break; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); } break; case ICMPV6_PARAMPROB: @@ -420,12 +421,12 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -633,20 +634,20 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(tunnel); + dst = ip6_tnl_dst_get(tunnel); if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -701,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(tunnel, ndst); + skb_dst_set(skb, dst); proto = NEXTHDR_GRE; if (encap_limit >= 0) { @@ -761,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb_set_inner_protocol(skb, protocol); ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_store(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1222,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = { static void ip6gre_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1244,9 +1243,10 @@ static void ip6gre_tunnel_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6gre_tunnel_init(struct net_device *dev) +static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; tunnel = netdev_priv(dev); @@ -1254,16 +1254,37 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = ip6_tnl_dst_init(tunnel); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + return 0; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int ret; + + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; + + tunnel = netdev_priv(dev); + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } @@ -1459,19 +1480,16 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; - tunnel = netdev_priv(dev); + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index adba03ac7ce9..9075acf081dd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -47,7 +47,7 @@ #include <net/inet_ecn.h> #include <net/dst_metadata.h> -int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb) +int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -109,7 +109,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; - IP6_ADD_STATS_BH(dev_net(dev), idev, + IP6_ADD_STATS_BH(net, idev, IPSTATS_MIB_NOECTPKTS + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); @@ -183,8 +183,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb, - dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, ip6_rcv_finish); err: IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); @@ -199,9 +199,8 @@ drop: */ -static int ip6_input_finish(struct sock *sk, struct sk_buff *skb) +static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; @@ -278,8 +277,8 @@ discard: int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_input_finish); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 26ea47930740..32583b507c2e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,7 +56,7 @@ #include <net/checksum.h> #include <linux/mroute6.h> -static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; @@ -71,7 +71,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && - ((mroute6_socket(dev_net(dev), skb) && + ((mroute6_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr))) { @@ -82,19 +82,18 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) */ if (newskb) NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, - sk, newskb, NULL, newskb->dev, + net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); if (ipv6_hdr(skb)->hop_limit == 0) { - IP6_INC_STATS(dev_net(dev), idev, + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } } - IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, - skb->len); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && @@ -116,48 +115,49 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } -static int ip6_finish_output(struct sock *sk, struct sk_buff *skb) +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) - return ip6_fragment(sk, skb, ip6_finish_output2); + return ip6_fragment(net, sk, skb, ip6_finish_output2); else - return ip6_finish_output2(sk, skb); + return ip6_finish_output2(net, sk, skb); } -int ip6_output(struct sock *sk, struct sk_buff *skb) +int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + if (unlikely(idev->cnf.disable_ipv6)) { - IP6_INC_STATS(dev_net(dev), idev, - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb, - NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, NULL, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP, SCTP and DCCP) + * Note : socket lock is not held for SYNACK packets, but might be modified + * by calls to skb_set_owner_w() and ipv6_local_error(), + * which are using proper atomic operations or spinlocks. */ - -int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, +int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct ipv6_txoptions *opt, int tclass) { struct net *net = sock_net(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr; @@ -186,7 +186,10 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, } consume_skb(skb); skb = skb2; - skb_set_owner_w(skb, sk); + /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically, + * it is safe to call in our context (socket lock not held) + */ + skb_set_owner_w(skb, (struct sock *)sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -224,12 +227,20 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, dst->dev, dst_output_sk); + /* hooks should never assume socket lock is held. + * we promote our socket to non const + */ + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, (struct sock *)sk, skb, NULL, dst->dev, + dst_output); } skb->dev = dst->dev; - ipv6_local_error(sk, EMSGSIZE, fl6, mtu); + /* ipv6_local_error() does not require socket lock, + * we promote our socket to non const + */ + ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; @@ -317,10 +328,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) return 0; } -static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb) +static inline int ip6_forward_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { skb_sender_cpu_clear(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) @@ -512,8 +524,8 @@ int ip6_forward(struct sk_buff *skb) IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb, - skb->dev, dst->dev, + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dst->dev, ip6_forward_finish); error: @@ -540,8 +552,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -int ip6_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); @@ -554,7 +566,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, __be32 frag_id; int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; - struct net *net = dev_net(skb_dst(skb)->dev); hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; @@ -586,20 +597,22 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); + hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ @@ -616,8 +629,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, err = 0; offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; @@ -625,8 +636,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, if (!tmp_hdr) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); - return -ENOMEM; + err = -ENOMEM; + goto fail; } + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); __skb_pull(skb, hlen); fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); @@ -671,7 +685,7 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, ip6_copy_metadata(frag, skb); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); @@ -723,7 +737,6 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; - hroom = LL_RESERVED_SPACE(rt->dst.dev); troom = rt->dst.dev->needed_tailroom; /* @@ -800,7 +813,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - err = output(sk, frag); + err = output(net, sk, frag); if (err) goto fail; @@ -881,7 +894,7 @@ out: return dst; } -static int ip6_dst_lookup_tail(struct net *net, struct sock *sk, +static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD @@ -1012,7 +1025,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); * It returns a valid dst pointer on success, or a pointer encoded * error code. */ -struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, +struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; @@ -1678,7 +1691,7 @@ int ip6_send_skb(struct sk_buff *skb) struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); int err; - err = ip6_local_out(skb); + err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b0ab420612bc..eabffbb89795 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,36 +126,92 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - struct dst_entry *dst = t->dst_cache; + write_seqlock_bh(&idst->lock); + dst_release(rcu_dereference_protected( + idst->dst, + lockdep_is_held(&idst->lock.lock))); + if (dst) { + dst_hold(dst); + idst->cookie = rt6_get_cookie((struct rt6_info *)dst); + } else { + idst->cookie = 0; + } + rcu_assign_pointer(idst->dst, dst); + write_sequnlock_bh(&idst->lock); +} + +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +{ + struct ip6_tnl_dst *idst; + struct dst_entry *dst; + unsigned int seq; + u32 cookie; - if (dst && dst->obsolete && - !dst->ops->check(dst, t->dst_cookie)) { - t->dst_cache = NULL; + idst = raw_cpu_ptr(t->dst_cache); + + rcu_read_lock(); + do { + seq = read_seqbegin(&idst->lock); + dst = rcu_dereference(idst->dst); + cookie = idst->cookie; + } while (read_seqretry(&idst->lock, seq)); + + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + + if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { + ip6_tnl_per_cpu_dst_set(idst, NULL); dst_release(dst); - return NULL; + dst = NULL; } - return dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { - dst_release(t->dst_cache); - t->dst_cache = NULL; + int i; + + for_each_possible_cpu(i) + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) +{ + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); + +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); + +void ip6_tnl_dst_destroy(struct ip6_tnl *t) { - struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt6_get_cookie(rt); - dst_release(t->dst_cache); - t->dst_cache = dst; + if (!t->dst_cache) + return; + + ip6_tnl_dst_reset(t); + free_percpu(t->dst_cache); } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); + +int ip6_tnl_dst_init(struct ip6_tnl *t) +{ + int i; + + t->dst_cache = alloc_percpu(struct ip6_tnl_dst); + if (!t->dst_cache) + return -ENOMEM; + + for_each_possible_cpu(i) + seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -271,6 +327,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -510,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; @@ -529,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -1010,23 +1069,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(t); + dst = ip6_tnl_dst_get(t); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -1072,12 +1131,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(t, ndst); + skb_dst_set(skb, dst); + skb->transport_header = skb->network_header; proto = fl6->flowi6_proto; @@ -1101,14 +1159,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_store(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1573,12 +1629,21 @@ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + int ret; t->dev = dev; t->net = dev_net(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + + ret = ip6_tnl_dst_init(t); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index e1a1136bda7c..14dacf1df529 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -23,6 +23,15 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; + if (cfg->ipv6_v6only) { + int val = 1; + + err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + (char *) &val, sizeof(val)); + if (err < 0) + goto error; + } + udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0224c032dca5..0a8610b33d79 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -482,7 +482,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) return -EMSGSIZE; } - err = dst_output(skb); + err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 74ceb73c1c9a..ad19136086dd 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .policy = ip6mr_rule_policy, @@ -550,7 +549,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc6_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == mrt->mfc6_cache_array) + else if (it->cache == &mrt->mfc6_cache_array[it->ct]) read_unlock(&mrt_lock); } @@ -1986,13 +1985,13 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) } #endif -static inline int ip6mr_forward2_finish(struct sock *sk, struct sk_buff *skb) +static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP6_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTOCTETS, skb->len); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } /* @@ -2064,8 +2063,8 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, IP6CB(skb)->flags |= IP6SKB_FORWARDED; - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb, - skb->dev, dev, + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dev, ip6mr_forward2_finish); out_free: diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 083b2927fc67..124338a39e29 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1645,8 +1645,8 @@ static void mld_sendpack(struct sk_buff *skb) payload_len = skb->len; err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net->ipv6.igmp_sk, skb, NULL, skb->dev, - dst_output_sk); + net, net->ipv6.igmp_sk, skb, NULL, skb->dev, + dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); @@ -2008,8 +2008,9 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) } skb_dst_set(skb, dst); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, skb->dev, dst_output_sk); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb->dev, + dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index b9779d441b12..60c79a08e14a 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -118,7 +118,7 @@ static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) struct mip6_report_rate_limiter { spinlock_t lock; - struct timeval stamp; + ktime_t stamp; int iif; struct in6_addr src; struct in6_addr dst; @@ -184,20 +184,18 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static inline int mip6_report_rl_allow(struct timeval *stamp, +static inline int mip6_report_rl_allow(ktime_t stamp, const struct in6_addr *dst, const struct in6_addr *src, int iif) { int allow = 0; spin_lock_bh(&mip6_report_rl.lock); - if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec || - mip6_report_rl.stamp.tv_usec != stamp->tv_usec || + if (!ktime_equal(mip6_report_rl.stamp, stamp) || mip6_report_rl.iif != iif || !ipv6_addr_equal(&mip6_report_rl.src, src) || !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { - mip6_report_rl.stamp.tv_sec = stamp->tv_sec; - mip6_report_rl.stamp.tv_usec = stamp->tv_usec; + mip6_report_rl.stamp = stamp; mip6_report_rl.iif = iif; mip6_report_rl.src = *src; mip6_report_rl.dst = *dst; @@ -216,7 +214,7 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; - struct timeval stamp; + ktime_t stamp; int err = 0; if (unlikely(fl6->flowi6_proto == IPPROTO_MH && @@ -230,9 +228,9 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, (skb_network_header(skb) + offset); } - skb_get_timestamp(skb, &stamp); + stamp = skb_get_ktime(skb); - if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr, + if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr, hao ? &hao->addr : &ipv6_hdr(skb)->saddr, opt->iif)) goto out; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 13d3c2beb93e..b18012f9f9fc 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -463,9 +463,9 @@ static void ndisc_send_skb(struct sk_buff *skb, idev = __in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, dst->dev, - dst_output_sk); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, dst->dev, + dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); @@ -474,8 +474,7 @@ static void ndisc_send_skb(struct sk_buff *skb, rcu_read_unlock(); } -void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, - const struct in6_addr *daddr, +void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt) { @@ -541,7 +540,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { - ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &ifa->addr, + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, /*inc_opt=*/ true); @@ -551,8 +550,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) in6_dev_put(idev); } -void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, - const struct in6_addr *solicit, +void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, struct sk_buff *oskb) { @@ -679,12 +677,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, neigh, target, target, saddr, skb); + ndisc_send_ns(dev, target, target, saddr, skb); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, NULL, target, &mcaddr, saddr, skb); + ndisc_send_ns(dev, target, &mcaddr, saddr, skb); } } @@ -828,7 +826,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) is_router = idev->cnf.forwarding; if (dad) { - ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, !!is_router, false, (ifp != NULL), true); goto out; } @@ -849,8 +847,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE); if (neigh || !dev->header_ops) { - ndisc_send_na(dev, neigh, saddr, &msg->target, - !!is_router, + ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); @@ -1078,6 +1075,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct ndisc_options ndopts; int optlen; unsigned int pref = 0; + __u32 old_if_flags; + bool send_ifinfo_notify = false; __u8 *opt = (__u8 *)(ra_msg + 1); @@ -1148,6 +1147,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) * Remember the managed/otherconf flags from most recently * received RA message (RFC 2462) -- yoshfuji */ + old_if_flags = in6_dev->if_flags; in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | IF_RA_OTHERCONF)) | (ra_msg->icmph.icmp6_addrconf_managed ? @@ -1155,6 +1155,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); + if (old_if_flags != in6_dev->if_flags) + send_ifinfo_notify = true; + if (!in6_dev->cnf.accept_ra_defrtr) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", @@ -1256,7 +1259,7 @@ skip_defrtr: rtime = HZ/10; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + send_ifinfo_notify = true; } rtime = ntohl(ra_msg->reachable_time); @@ -1273,11 +1276,17 @@ skip_defrtr: GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + send_ifinfo_notify = true; } } } + /* + * Send a notify if RA changed managed/otherconf flags or timer settings + */ + if (send_ifinfo_notify) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + skip_linkparms: /* diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index b4de08a83e0b..d11c46833d61 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -18,9 +18,8 @@ #include <net/ip6_checksum.h> #include <net/netfilter/nf_queue.h> -int ip6_route_me_harder(struct sk_buff *skb) +int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct ipv6hdr *iph = ipv6_hdr(skb); unsigned int hh_len; struct dst_entry *dst; @@ -93,7 +92,7 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, } } -static int nf_ip6_reroute(struct sk_buff *skb, +static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, const struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -103,7 +102,7 @@ static int nf_ip6_reroute(struct sk_buff *skb, if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) - return ip6_route_me_harder(skb); + return ip6_route_me_harder(net, skb); } return 0; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 4e21f80228be..80e3bd72b715 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -275,7 +275,8 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, return 0; } -static void trace_packet(const struct sk_buff *skb, +static void trace_packet(struct net *net, + const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, @@ -287,7 +288,6 @@ static void trace_packet(const struct sk_buff *skb, const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; - struct net *net = dev_net(in ? in : out); root = get_entry(private->entries, private->hook_entry[hook]); @@ -314,10 +314,10 @@ ip6t_next_entry(const struct ip6t_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ip6t_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; @@ -340,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb, * rule is also a fragment-specific rule, non-fragments won't * match it. */ acpar.hotdrop = false; + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.family = NFPROTO_IPV6; @@ -401,8 +402,8 @@ ip6t_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, state->in, state->out, - table->name, private, e); + trace_packet(state->net, skb, hook, state->in, + state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { @@ -455,15 +456,11 @@ ip6t_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - * there are loops. Puts hook bitmask in comefrom. - * - * Keeps track of largest call depth seen and stores it in newinfo->stacksize. - */ + there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { - unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -477,7 +474,6 @@ mark_source_chains(struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; - calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -539,8 +535,6 @@ mark_source_chains(struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; - if (calldepth > 0) - --calldepth; } else { int newpos = t->verdict; @@ -554,11 +548,6 @@ mark_source_chains(struct xt_table_info *newinfo, newpos); return 0; } - if (entry0 + newpos != ip6t_next_entry(e) && - !(e->ipv6.flags & IP6T_F_GOTO) && - ++calldepth > max_calldepth) - max_calldepth = calldepth; - /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -575,7 +564,6 @@ mark_source_chains(struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } - newinfo->stacksize = max_calldepth; return 1; } @@ -855,6 +843,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; + if (strcmp(ip6t_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1767,6 +1758,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ip6t_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 567367a75172..db29bbf41b59 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -39,7 +39,7 @@ static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; - struct net *net = dev_net((par->in != NULL) ? par->in : par->out); + struct net *net = par->net; switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: @@ -63,6 +63,12 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) case IP6T_TCP_RESET: nf_send_reset6(net, skb, par->hooknum); break; + case IP6T_ICMP6_POLICY_FAIL: + nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum); + break; + case IP6T_ICMP6_REJECT_ROUTE: + nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum); + break; } return NF_DROP; diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index ebbb754c2111..a10a2a9e9f94 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -76,7 +76,7 @@ synproxy_send_tcp(const struct synproxy_net *snet, nf_conntrack_get(nfct); } - ip6_local_out(nskb); + ip6_local_out(net, nskb->sk, nskb); return; free_nskb: @@ -237,7 +237,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; @@ -275,7 +275,7 @@ static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_net *snet = synproxy_pernet(par->net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -316,11 +316,11 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops, +static unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out)); + struct synproxy_net *snet = synproxy_pernet(nhs->net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index 790e0c6b19e1..1ee1b25df096 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -26,7 +26,7 @@ static bool rpfilter_addr_unicast(const struct in6_addr *addr) return addr_type & IPV6_ADDR_UNICAST; } -static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, +static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, const struct net_device *dev, u8 flags) { struct rt6_info *rt; @@ -53,7 +53,7 @@ static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, lookup_flags |= RT6_LOOKUP_F_IFACE; } - rt = (void *) ip6_route_lookup(dev_net(dev), &fl6, lookup_flags); + rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags); if (rt->dst.error) goto out; @@ -93,7 +93,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) if (unlikely(saddrtype == IPV6_ADDR_ANY)) return true ^ invert; /* not routable: forward path will drop it */ - return rpfilter_lookup_reverse6(skb, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse6(par->net, skb, par->in, info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 5c33d8abc077..8b277b983ca5 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -32,12 +32,10 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c. */ static unsigned int -ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_filter); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index b551f5b79fe2..abe278b07932 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -57,8 +57,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, state, - dev_net(state->out)->ipv6.ip6table_mangle); + ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || @@ -66,7 +65,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { - err = ip6_route_me_harder(skb); + err = ip6_route_me_harder(state->net, skb); if (err < 0) ret = NF_DROP_ERR(err); } @@ -76,17 +75,16 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* The work comes in here from netfilter.c. */ static unsigned int -ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (ops->hooknum == NF_INET_LOCAL_OUT) + if (state->hook == NF_INET_LOCAL_OUT) return ip6t_mangle_out(skb, state); - if (ops->hooknum == NF_INET_POST_ROUTING) - return ip6t_do_table(skb, ops->hooknum, state, - dev_net(state->out)->ipv6.ip6table_mangle); + if (state->hook == NF_INET_POST_ROUTING) + return ip6t_do_table(skb, state, + state->net->ipv6.ip6table_mangle); /* INPUT/FORWARD */ - return ip6t_do_table(skb, ops->hooknum, state, - dev_net(state->in)->ipv6.ip6table_mangle); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index c3a7f7af0ed4..abea175d5853 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -30,42 +30,40 @@ static const struct xt_table nf_nat_ipv6_table = { .af = NFPROTO_IPV6, }; -static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - - return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_nat); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat); } -static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_fn(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_fn(priv, skb, state, ip6table_nat_do_chain); } -static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_in(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_in(priv, skb, state, ip6table_nat_do_chain); } -static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_out(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_out(priv, skb, state, ip6table_nat_do_chain); } -static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_local_fn(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 0b33caad2b69..9021963565c3 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -19,12 +19,10 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_raw); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index fcef83c25f7b..0d856fedfeb0 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -36,13 +36,10 @@ static const struct xt_table security_table = { }; static unsigned int -ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return ip6t_do_table(skb, ops->hooknum, state, - net->ipv6.ip6table_security); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 7302900c321a..dd83ad42f8f6 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -95,7 +95,7 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv6_helper(const struct nf_hook_ops *ops, +static unsigned int ipv6_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -131,7 +131,7 @@ static unsigned int ipv6_helper(const struct nf_hook_ops *ops, return helper->help(skb, protoff, ct, ctinfo); } -static unsigned int ipv6_confirm(const struct nf_hook_ops *ops, +static unsigned int ipv6_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -165,14 +165,14 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops, +static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(state->in), PF_INET6, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } -static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, +static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -181,7 +181,7 @@ static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); return NF_ACCEPT; } - return nf_conntrack_in(dev_net(state->out), PF_INET6, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 0e6fae103d33..d3b797446cea 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -36,6 +36,7 @@ static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; @@ -159,7 +160,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, skb_network_offset(skb) + sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr), - PF_INET6, &origtuple)) { + PF_INET6, net, &origtuple)) { pr_debug("icmpv6_error: Can't get tuple\n"); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 6d02498172c1..2fb86a99bf5f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -563,12 +563,10 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) return 0; } -struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) +struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { struct sk_buff *clone; struct net_device *dev = skb->dev; - struct net *net = skb_dst(skb) ? dev_net(skb_dst(skb)->dev) - : dev_net(skb->dev); struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; @@ -633,6 +631,7 @@ ret_orig: kfree_skb(clone); return skb; } +EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); void nf_ct_frag6_consume_orig(struct sk_buff *skb) { diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 6d9c0b3d5b8c..5173a89a238e 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -51,7 +51,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, return IP6_DEFRAG_CONNTRACK_OUT + zone_id; } -static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, +static unsigned int ipv6_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -63,7 +63,8 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, return NF_ACCEPT; #endif - reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb)); + reasm = nf_ct_frag6_gather(state->net, skb, + nf_ct6_defrag_user(state->hook, skb)); /* queued */ if (reasm == NULL) return NF_STOLEN; @@ -74,7 +75,7 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, nf_ct_frag6_consume_orig(reasm); - NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, state->sk, reasm, + NF_HOOK_THRESH(NFPROTO_IPV6, state->hook, state->net, state->sk, reasm, state->in, state->out, state->okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index d8ab654080b4..6989c70ae29f 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/skbuff.h> +#include <linux/netfilter.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/netfilter/ipv6/nf_dup_ipv6.h> @@ -18,25 +19,10 @@ #include <net/netfilter/nf_conntrack.h> #endif -static struct net *pick_net(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_NS - const struct dst_entry *dst; - - if (skb->dev != NULL) - return dev_net(skb->dev); - dst = skb_dst(skb); - if (dst != NULL && dst->dev != NULL) - return dev_net(dst->dev); -#endif - return &init_net; -} - -static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw, - int oif) +static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, + const struct in6_addr *gw, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); - struct net *net = pick_net(skb); struct dst_entry *dst; struct flowi6 fl6; @@ -45,8 +31,8 @@ static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw, fl6.flowi6_oif = oif; fl6.daddr = *gw; - fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | - (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) | + (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); @@ -60,7 +46,7 @@ static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw, return true; } -void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum, +void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { if (this_cpu_read(nf_skb_duplicated)) @@ -80,9 +66,9 @@ void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum, struct ipv6hdr *iph = ipv6_hdr(skb); --iph->hop_limit; } - if (nf_dup_ipv6_route(skb, gw, oif)) { + if (nf_dup_ipv6_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip6_local_out(skb); + ip6_local_out(net, skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 70fbaed49edb..238e70c3f7b7 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -262,9 +262,9 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); unsigned int -nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -272,7 +272,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); __be16 frag_off; int hdrlen; u8 nexthdr; @@ -303,7 +303,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, - ops->hooknum, + state->hook, hdrlen)) return NF_DROP; else @@ -317,21 +317,21 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, state, ct); + ret = do_chain(priv, skb, state, ct); if (ret != NF_ACCEPT) return ret; - if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) break; - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; @@ -340,11 +340,11 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -353,9 +353,9 @@ oif_changed: EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn); unsigned int -nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -363,7 +363,7 @@ nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, unsigned int ret; struct in6_addr daddr = ipv6_hdr(skb)->daddr; - ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -373,9 +373,9 @@ nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv6_in); unsigned int -nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -391,7 +391,7 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, if (skb->len < sizeof(struct ipv6hdr)) return NF_ACCEPT; - ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && @@ -403,7 +403,7 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET6); + err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } @@ -414,9 +414,9 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv6_out); unsigned int -nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -430,14 +430,14 @@ nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (skb->len < sizeof(struct ipv6hdr)) return NF_ACCEPT; - ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - err = ip6_route_me_harder(skb); + err = ip6_route_me_harder(state->net, skb); if (err < 0) ret = NF_DROP_ERR(err); } @@ -446,7 +446,7 @@ nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET6); + err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 7745609665cd..31ba7ca19757 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -34,7 +34,7 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY)); - if (ipv6_dev_get_saddr(dev_net(out), out, + if (ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) return NF_DROP; diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 94b4c6dfb400..7309e475f68b 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -206,7 +206,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip6_local_out(nskb); + ip6_local_out(net, nskb->sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index c8148ba76d1a..120ea9131be0 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -16,20 +16,20 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_ipv6.h> -static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops, +static unsigned int nft_do_chain_ipv6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0) + if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) return NF_DROP; - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, +static unsigned int nft_ipv6_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -40,7 +40,7 @@ static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv6(ops, skb, state); + return nft_do_chain_ipv6(priv, skb, state); } struct nft_af_info nft_af_ipv6 __read_mostly = { diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 951bb458b7bd..443cd306c0b0 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -24,44 +24,44 @@ #include <net/netfilter/nf_nat_l3proto.h> #include <net/ipv6.h> -static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, ops, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_fn(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv6_in(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_in(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_in(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv6_out(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_out(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_out(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv6_local_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_local_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv6 = { diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 0dafdaac5e17..9df75bd7c94a 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -22,7 +22,7 @@ #include <net/netfilter/nf_tables_ipv6.h> #include <net/route.h> -static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +static unsigned int nf_route_table_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -33,7 +33,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, u32 mark, flowlabel; /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0) + if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) return NF_DROP; /* save source/dest address, mark, hoplimit, flowlabel, priority */ @@ -45,14 +45,14 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u32 *)ipv6_hdr(skb)); - ret = nft_do_chain(&pkt, ops); + ret = nft_do_chain(&pkt, priv); if (ret != NF_DROP && ret != NF_QUEUE && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP; return ret; } diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 0eaa4f65fdea..8bfd470cbe72 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -28,7 +28,7 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr, struct in6_addr *gw = (struct in6_addr *)®s->data[priv->sreg_addr]; int oif = regs->data[priv->sreg_dev]; - nf_dup_ipv6(pkt->skb, pkt->ops->hooknum, gw, oif); + nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif); } static int nft_dup_ipv6_init(const struct nft_ctx *ctx, diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index effd393bd517..aca44e89a881 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -35,8 +35,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, range.flags |= priv->flags; - regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, - pkt->ops->hooknum); + regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->hook); } static struct nft_expr_type nft_redir_ipv6_type; diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index d0d1540ecf87..533cd5719c59 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -24,15 +24,14 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code, + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + nf_send_reset6(pkt->net, pkt->skb, pkt->hook); break; default: break; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 928a0fb0b744..462f2a76b5c2 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -138,7 +138,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst) EXPORT_SYMBOL(ip6_dst_hoplimit); #endif -static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int len; @@ -148,30 +148,20 @@ static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); - return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); -} - -int __ip6_local_out(struct sk_buff *skb) -{ - return __ip6_local_out_sk(skb->sk, skb); + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb_dst(skb)->dev, + dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); -int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; - err = __ip6_local_out_sk(sk, skb); + err = __ip6_local_out(net, sk, skb); if (likely(err == 1)) - err = dst_output_sk(sk, skb); + err = dst_output(net, sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip6_local_out_sk); - -int ip6_local_out(struct sk_buff *skb) -{ - return ip6_local_out_sk(skb->sk, skb); -} EXPORT_SYMBOL_GPL(ip6_local_out); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fdbada1569a3..dc65ec198f7c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -614,6 +614,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, unsigned int flags) { struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; @@ -652,9 +653,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (err) goto error_fault; - IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, rt->dst.dev, dst_output_sk); + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, + NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -666,7 +667,7 @@ error_fault: err = -EFAULT; kfree_skb(skb); error: - IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); if (err == -ENOBUFS && !np->recverr) err = 0; return err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index df3e353a012d..db5b54ad5912 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -86,9 +86,9 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); @@ -308,7 +308,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, - .output = dst_discard_sk, + .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -421,31 +421,7 @@ static bool rt6_check_expired(const struct rt6_info *rt) static int rt6_info_hash_nhsfn(unsigned int candidate_count, const struct flowi6 *fl6) { - unsigned int val = fl6->flowi6_proto; - - val ^= ipv6_addr_hash(&fl6->daddr); - val ^= ipv6_addr_hash(&fl6->saddr); - - /* Work only if this not encapsulated */ - switch (fl6->flowi6_proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_SCTP: - val ^= (__force u16)fl6->fl6_sport; - val ^= (__force u16)fl6->fl6_dport; - break; - - case IPPROTO_ICMPV6: - val ^= (__force u16)fl6->fl6_icmp_type; - val ^= (__force u16)fl6->fl6_icmp_code; - break; - } - /* RFC6438 recommands to use flowlabel */ - val ^= (__force u32)fl6->flowlabel; - - /* Perhaps, we need to tune, this function? */ - val = val ^ (val >> 7) ^ (val >> 12); - return val % candidate_count; + return get_hash_from_flowi6(fl6) % candidate_count; } static struct rt6_info *rt6_multipath_select(struct rt6_info *match, @@ -498,10 +474,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (dev->flags & IFF_LOOPBACK) { if (!sprt->rt6i_idev || sprt->rt6i_idev->dev->ifindex != oif) { - if (flags & RT6_LOOKUP_F_IFACE && oif) + if (flags & RT6_LOOKUP_F_IFACE) continue; - if (local && (!oif || - local->rt6i_idev->dev->ifindex == oif)) + if (local && + local->rt6i_idev->dev->ifindex == oif) continue; } local = sprt; @@ -538,7 +514,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, NULL); dev_put(work->dev); kfree(work); } @@ -1174,7 +1150,7 @@ void ip6_route_input(struct sk_buff *skb) }; tun_info = skb_tunnel_info(skb); - if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); @@ -1193,7 +1169,8 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; - if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || + fl6->flowi6_oif) flags |= RT6_LOOKUP_F_IFACE; if (!ipv6_addr_any(&fl6->saddr)) @@ -1218,7 +1195,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + new->output = dst_discard_out; if (dst_metrics_read_only(&ort->dst)) new->_metrics = ort->dst._metrics; @@ -1322,8 +1299,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } @@ -1698,6 +1674,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int remaining; u32 *mp; @@ -1711,52 +1688,57 @@ static int ip6_convert_metrics(struct mx6_config *mxc, nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); + u32 val; - if (type) { - u32 val; + if (!type) + continue; + if (unlikely(type > RTAX_MAX)) + goto err; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; - if (unlikely(type > RTAX_MAX)) + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) goto err; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + goto err; - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } + mp[type - 1] = val; + __set_bit(type - 1, mxc->mx_valid); + } - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); - } + if (ecn_ca) { + __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); + mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; } mxc->mx = mp; - return 0; err: kfree(mp); return -EINVAL; } -int ip6_route_add(struct fib6_config *cfg) +static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { - int err; struct net *net = cfg->fc_nlinfo.nl_net; struct rt6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; - struct mx6_config mxc = { .mx = NULL, }; int addr_type; + int err = -EINVAL; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) - return -EINVAL; + goto out; #ifndef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len) - return -EINVAL; + goto out; #endif if (cfg->fc_ifindex) { err = -ENODEV; @@ -1871,7 +1853,7 @@ int ip6_route_add(struct fib6_config *cfg) switch (cfg->fc_type) { case RTN_BLACKHOLE: rt->dst.error = -EINVAL; - rt->dst.output = dst_discard_sk; + rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: @@ -1880,9 +1862,11 @@ int ip6_route_add(struct fib6_config *cfg) rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: + case RTN_UNREACHABLE: default: rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : -ENETUNREACH; + : (cfg->fc_type == RTN_UNREACHABLE) + ? -EHOSTUNREACH : -ENETUNREACH; rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; @@ -1974,6 +1958,31 @@ install_route: cfg->fc_nlinfo.nl_net = dev_net(dev); + return rt; +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free(&rt->dst); + + return ERR_PTR(err); +} + +int ip6_route_add(struct fib6_config *cfg) +{ + struct mx6_config mxc = { .mx = NULL, }; + struct rt6_info *rt; + int err; + + rt = ip6_route_info_create(cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto out; + } + err = ip6_convert_metrics(&mxc, cfg); if (err) goto out; @@ -1981,14 +1990,12 @@ install_route: err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); kfree(mxc.mx); + return err; out: - if (dev) - dev_put(dev); - if (idev) - in6_dev_put(idev); if (rt) dst_free(&rt->dst); + return err; } @@ -1998,7 +2005,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.ip6_null_entry || + rt->dst.flags & DST_NOCACHE) { err = -ENOENT; goto out; } @@ -2437,7 +2445,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); @@ -2448,7 +2456,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); @@ -2485,6 +2493,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + rt->dst.flags |= DST_NOCACHE; atomic_set(&rt->dst.__refcnt, 1); @@ -2769,19 +2778,78 @@ errout: return err; } -static int ip6_route_multipath(struct fib6_config *cfg, int add) +struct rt6_nh { + struct rt6_info *rt6_info; + struct fib6_config r_cfg; + struct mx6_config mxc; + struct list_head next; +}; + +static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) +{ + struct rt6_nh *nh; + + list_for_each_entry(nh, rt6_nh_list, next) { + pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", + &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, + nh->r_cfg.fc_ifindex); + } +} + +static int ip6_route_info_append(struct list_head *rt6_nh_list, + struct rt6_info *rt, struct fib6_config *r_cfg) +{ + struct rt6_nh *nh; + struct rt6_info *rtnh; + int err = -EEXIST; + + list_for_each_entry(nh, rt6_nh_list, next) { + /* check if rt6_info already exists */ + rtnh = nh->rt6_info; + + if (rtnh->dst.dev == rt->dst.dev && + rtnh->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&rtnh->rt6i_gateway, + &rt->rt6i_gateway)) + return err; + } + + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) + return -ENOMEM; + nh->rt6_info = rt; + err = ip6_convert_metrics(&nh->mxc, r_cfg); + if (err) { + kfree(nh); + return err; + } + memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); + list_add_tail(&nh->next, rt6_nh_list); + + return 0; +} + +static int ip6_route_multipath_add(struct fib6_config *cfg) { struct fib6_config r_cfg; struct rtnexthop *rtnh; + struct rt6_info *rt; + struct rt6_nh *err_nh; + struct rt6_nh *nh, *nh_safe; int remaining; int attrlen; - int err = 0, last_err = 0; + int err = 1; + int nhn = 0; + int replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); + LIST_HEAD(rt6_nh_list); remaining = cfg->fc_mp_len; -beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - /* Parse a Multipath Entry */ + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * rt6_info structs per nexthop + */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) @@ -2801,22 +2869,35 @@ beginning: if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } - err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + + rt = ip6_route_info_create(&r_cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto cleanup; + } + + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - last_err = err; - /* If we are trying to remove a route, do not stop the - * loop when ip6_route_del() fails (because next hop is - * already gone), we should try to remove all next hops. - */ - if (add) { - /* If add fails, we should try to delete all - * next hops that have been already added. - */ - add = 0; - remaining = cfg->fc_mp_len - remaining; - goto beginning; - } + dst_free(&rt->dst); + goto cleanup; + } + + rtnh = rtnh_next(rtnh, &remaining); + } + + err_nh = NULL; + list_for_each_entry(nh, &rt6_nh_list, next) { + err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + /* nh->rt6_info is used or freed at this point, reset to NULL*/ + nh->rt6_info = NULL; + if (err) { + if (replace && nhn) + ip6_print_replace_route_err(&rt6_nh_list); + err_nh = nh; + goto add_errout; } + /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: @@ -2826,6 +2907,62 @@ beginning: */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + nhn++; + } + + goto cleanup; + +add_errout: + /* Delete routes that were already added */ + list_for_each_entry(nh, &rt6_nh_list, next) { + if (err_nh == nh) + break; + ip6_route_del(&nh->r_cfg); + } + +cleanup: + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + if (nh->rt6_info) + dst_free(&nh->rt6_info->dst); + kfree(nh->mxc.mx); + list_del(&nh->next); + kfree(nh); + } + + return err; +} + +static int ip6_route_multipath_del(struct fib6_config *cfg) +{ + struct fib6_config r_cfg; + struct rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 1, last_err = 0; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = ip6_route_del(&r_cfg); + if (err) + last_err = err; + rtnh = rtnh_next(rtnh, &remaining); } @@ -2842,7 +2979,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 0); + return ip6_route_multipath_del(&cfg); else return ip6_route_del(&cfg); } @@ -2857,7 +2994,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 1); + return ip6_route_multipath_add(&cfg); else return ip6_route_add(&cfg); } @@ -3148,7 +3285,8 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; @@ -3163,7 +3301,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 0909f4e0d53c..bb8f2fa1c7fb 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -114,14 +114,11 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); -__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) +__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - tcp_synq_overflow(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return __cookie_v6_init_sequence(iph, th, mssp); } @@ -173,7 +170,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk); + req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); if (!req) goto out; @@ -210,7 +207,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->snt_synack.v64 = 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; @@ -238,9 +235,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out_free; } - req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 97d9314ea361..2887c8474b65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -70,8 +70,8 @@ #include <linux/crypto.h> #include <linux/scatterlist.h> -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); @@ -82,7 +82,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #else -static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr) { return NULL; @@ -434,11 +434,12 @@ out: } -static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -447,10 +448,11 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, int err = -ENOMEM; /* First, grab a route. */ - if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) + if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, + IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, @@ -476,13 +478,13 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) } #ifdef CONFIG_TCP_MD5SIG -static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr) { return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); } -static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); @@ -621,8 +623,12 @@ clear_hash_noput: return 1; } -static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +#endif + +static bool tcp_v6_inbound_md5_hash(const struct sock *sk, + const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -659,26 +665,27 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) &ip6h->daddr, ntohs(th->dest)); return true; } +#endif return false; } -#endif -static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, +static void tcp_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && + if (!sk_listener->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { @@ -687,13 +694,14 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, } } -static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v6_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { if (strict) *strict = true; - return inet6_csk_route_req(sk, &fl->u.ip6, req); + return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { @@ -720,10 +728,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_sequence, .send_synack = tcp_v6_send_synack, - .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; -static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, +static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, u8 tclass, u32 label) @@ -822,7 +829,7 @@ static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, kfree_skb(buff); } -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); u32 seq = 0, ack_seq = 0; @@ -893,7 +900,7 @@ release_sk1: #endif } -static void tcp_v6_send_ack(struct sock *sk, struct sk_buff *skb, u32 seq, +static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, u32 label) @@ -916,7 +923,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV @@ -924,44 +931,18 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); } -static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - /* Find possible connection requests. */ - req = inet6_csk_search_req(sk, th->source, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), - tcp_v6_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v6_check(sk, skb); #endif @@ -984,12 +965,13 @@ drop: return 0; /* don't send reset */ } -static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_request_sock *ireq; - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_pinfo *newnp; + const struct ipv6_pinfo *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; @@ -1057,7 +1039,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto out_overflow; if (!dst) { - dst = inet6_csk_route_req(sk, &fl6, req); + dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) goto out; } @@ -1090,8 +1072,6 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; - sk_set_txhash(newsk); - /* Now IPv6 options... First: no IPv4 options. @@ -1181,7 +1161,7 @@ out: } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1252,18 +1232,14 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v6_hnd_req(sk, skb); + struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) @@ -1273,7 +1249,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) + if (tcp_rcv_state_process(sk, skb)) goto reset; if (opt_skb) goto ipv6_pktoptions; @@ -1396,6 +1372,33 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + tcp_v6_fill_cb(skb, hdr, th); + if (tcp_v6_inbound_md5_hash(sk, skb)) { + reqsk_put(req); + goto discard_it; + } + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + tcp_v6_restore_cb(skb); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v6_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1406,17 +1409,21 @@ process: tcp_v6_fill_cb(skb, hdr, th); -#ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v6_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1431,6 +1438,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret ? -1 : 0; @@ -1631,7 +1639,7 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - struct request_sock *req, int i, kuid_t uid) + const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; @@ -1655,7 +1663,8 @@ static void get_openreq6(struct seq_file *seq, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, - from_kuid_munged(seq_user_ns(seq), uid), + from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1670,7 +1679,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1714,7 +1723,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, sp->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } @@ -1760,18 +1769,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait6_sock(seq, v, st->num); - else - get_tcp6_sock(seq, v, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: - get_openreq6(seq, v, st->num, st->uid); - break; - } + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait6_sock(seq, v, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + get_openreq6(seq, v, st->num); + else + get_tcp6_sock(seq, v, st->num); out: return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0aba654f5b91..01bcb49619ee 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -182,10 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } -#define SCORE2_MAX (1 + 1 + 1) static inline int compute_score2(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, @@ -223,6 +225,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } @@ -251,8 +256,7 @@ begin: hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); matches = 1; - } else if (score == SCORE2_MAX) - goto exact_match; + } } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -269,7 +273,6 @@ begin: goto begin; if (result) { -exact_match: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 74bd17882a2f..0eaab1fa6be5 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -42,8 +42,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) ipv6_hdr(skb)->payload_len = htons(skb->len); __skb_push(skb, skb->data - skb_network_header(skb)); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_rcv_finish); return -1; } diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 09c76a7b474d..9db067a11b52 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -131,7 +131,14 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } -static int __xfrm6_output(struct sock *sk, struct sk_buff *skb) +static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct xfrm_state *x = skb_dst(skb)->xfrm; + + return x->outer_mode->afinfo->output_finish(sk, skb); +} + +static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; @@ -140,7 +147,7 @@ static int __xfrm6_output(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_NETFILTER if (!x) { IP6CB(skb)->flags |= IP6SKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif @@ -160,15 +167,16 @@ static int __xfrm6_output(struct sock *sk, struct sk_buff *skb) if (x->props.mode == XFRM_MODE_TUNNEL && ((skb->len > mtu && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(sk, skb, - x->outer_mode->afinfo->output_finish); + return ip6_fragment(net, sk, skb, + __xfrm6_output_finish); } return x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm6_output(struct sock *sk, struct sk_buff *skb) +int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb, - NULL, skb_dst(skb)->dev, __xfrm6_output, + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb_dst(skb)->dev, + __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 30caa289c5db..08c9c93f3527 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -20,7 +20,7 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> -#include <net/vrf.h> +#include <net/l3mdev.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif @@ -37,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; + fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -132,10 +133,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) nexthdr = nh[nhoff]; - if (skb_dst(skb)) { - oif = vrf_master_ifindex(skb_dst(skb)->dev) ? - : skb_dst(skb)->dev->ifindex; - } + if (skb_dst(skb)) + oif = l3mdev_fib_oif(skb_dst(skb)->dev); memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 918151c11348..fcb2752419c6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -95,11 +95,10 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify); /* Call Back functions */ static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *); -static void iucv_callback_connack(struct iucv_path *, u8 ipuser[16]); -static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8], - u8 ipuser[16]); -static void iucv_callback_connrej(struct iucv_path *, u8 ipuser[16]); -static void iucv_callback_shutdown(struct iucv_path *, u8 ipuser[16]); +static void iucv_callback_connack(struct iucv_path *, u8 *); +static int iucv_callback_connreq(struct iucv_path *, u8 *, u8 *); +static void iucv_callback_connrej(struct iucv_path *, u8 *); +static void iucv_callback_shutdown(struct iucv_path *, u8 *); static struct iucv_sock_list iucv_sk_list = { .lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock), diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 2a6a1fdd62c0..7eaa000c9258 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -713,7 +713,7 @@ static struct notifier_block __refdata iucv_cpu_notifier = { * * Sever an iucv path to free up the pathid. Used internally. */ -static int iucv_sever_pathid(u16 pathid, u8 userdata[16]) +static int iucv_sever_pathid(u16 pathid, u8 *userdata) { union iucv_param *parm; @@ -876,7 +876,7 @@ static struct notifier_block iucv_reboot_notifier = { * Returns the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, - u8 userdata[16], void *private) + u8 *userdata, void *private) { union iucv_param *parm; int rc; @@ -923,7 +923,7 @@ EXPORT_SYMBOL(iucv_path_accept); * Returns the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, - u8 userid[8], u8 system[8], u8 userdata[16], + u8 *userid, u8 *system, u8 *userdata, void *private) { union iucv_param *parm; @@ -985,7 +985,7 @@ EXPORT_SYMBOL(iucv_path_connect); * * Returns the result from the CP IUCV call. */ -int iucv_path_quiesce(struct iucv_path *path, u8 userdata[16]) +int iucv_path_quiesce(struct iucv_path *path, u8 *userdata) { union iucv_param *parm; int rc; @@ -1017,7 +1017,7 @@ EXPORT_SYMBOL(iucv_path_quiesce); * * Returns the result from the CP IUCV call. */ -int iucv_path_resume(struct iucv_path *path, u8 userdata[16]) +int iucv_path_resume(struct iucv_path *path, u8 *userdata) { union iucv_param *parm; int rc; @@ -1047,7 +1047,7 @@ out: * * Returns the result from the CP IUCV call. */ -int iucv_path_sever(struct iucv_path *path, u8 userdata[16]) +int iucv_path_sever(struct iucv_path *path, u8 *userdata) { int rc; diff --git a/net/key/af_key.c b/net/key/af_key.c index b397f0aa9005..83a70688784b 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -219,7 +219,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 -static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, +static int pfkey_broadcast(struct sk_buff *skb, int broadcast_flags, struct sock *one_sk, struct net *net) { @@ -244,7 +244,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, * socket. */ if (pfk->promisc) - pfkey_broadcast_one(skb, &skb2, allocation, sk); + pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) @@ -259,7 +259,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, continue; } - err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk); + err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); /* Error is cleare after succecful sending to at least one * registered KM */ @@ -269,7 +269,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); + err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk); kfree_skb(skb2); kfree_skb(skb); @@ -292,7 +292,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; - pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } @@ -333,7 +333,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1365,7 +1365,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ xfrm_state_put(x); - pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); + pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net); return 0; } @@ -1452,7 +1452,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x)); return 0; } @@ -1565,7 +1565,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1670,7 +1670,7 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad return -ENOBUFS; } - pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); + pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk)); return 0; } @@ -1689,7 +1689,7 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); + return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) @@ -1710,7 +1710,7 @@ static int key_notify_sa_flush(const struct km_event *c) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net); return 0; } @@ -1767,7 +1767,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -1847,7 +1847,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb new_hdr->sadb_msg_errno = 0; } - pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } @@ -2181,7 +2181,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); + pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } @@ -2401,7 +2401,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); + pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: @@ -2655,7 +2655,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -2708,7 +2708,7 @@ static int key_notify_policy_flush(const struct km_event *c) hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net); return 0; } @@ -2770,7 +2770,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); @@ -2992,7 +2992,7 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); + pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x)); return 0; } @@ -3182,7 +3182,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_ctx->ctx_len); } - return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, @@ -3380,7 +3380,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; - return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE @@ -3572,7 +3572,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net); return 0; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index f6b090df3930..afca2eb4dfa7 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1319,7 +1319,7 @@ static void l2tp_tunnel_del_work(struct work_struct *work) tunnel = container_of(work, struct l2tp_tunnel, del_work); sk = l2tp_tunnel_sock_lookup(tunnel); if (!sk) - return; + goto out; sock = sk->sk_socket; @@ -1341,6 +1341,8 @@ static void l2tp_tunnel_del_work(struct work_struct *work) } l2tp_tunnel_sock_put(sk); +out: + l2tp_tunnel_dec_refcount(tunnel); } /* Create a socket for the tunnel, if one isn't set up by @@ -1636,8 +1638,13 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); */ int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { + l2tp_tunnel_inc_refcount(tunnel); l2tp_tunnel_closeall(tunnel); - return (false == queue_work(l2tp_wq, &tunnel->del_work)); + if (false == queue_work(l2tp_wq, &tunnel->del_work)) { + l2tp_tunnel_dec_refcount(tunnel); + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 68aa9ffd4ae4..5871537af387 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -321,4 +321,7 @@ do { \ #define l2tp_dbg(ptr, type, fmt, ...) \ l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__) +#define MODULE_ALIAS_L2TP_PWTYPE(type) \ + MODULE_ALIAS("net-l2tp-type-" __stringify(type)) + #endif /* _L2TP_CORE_H_ */ diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 4b552873b556..e253c26f31ac 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -358,3 +358,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP ethernet pseudowire driver"); MODULE_VERSION("1.0"); +MODULE_ALIAS_L2TP_PWTYPE(5); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 79649937ec71..ec22078b0914 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -655,3 +655,4 @@ MODULE_VERSION("1.0"); * enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP); +MODULE_ALIAS_NET_PF_PROTO(PF_INET, IPPROTO_L2TP); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d1ded3777815..aca38d8aed8e 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -801,3 +801,4 @@ MODULE_VERSION("1.0"); * enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 2, IPPROTO_L2TP); +MODULE_ALIAS_NET_PF_PROTO(PF_INET6, IPPROTO_L2TP); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 9e13c2ff8789..f93c5be612a7 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -576,6 +576,13 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_MRU]) cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]); +#ifdef CONFIG_MODULES + if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) { + genl_unlock(); + request_module("net-l2tp-type-%u", cfg.pw_type); + genl_lock(); + } +#endif if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { ret = -EPROTONOSUPPORT; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f56c9f69e9f2..1ad18c55064c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1863,3 +1863,4 @@ MODULE_DESCRIPTION("PPP over L2TP over UDP"); MODULE_LICENSE("GPL"); MODULE_VERSION(PPPOL2TP_DRV_VERSION); MODULE_ALIAS("pppox-proto-" __stringify(PX_PROTO_OL2TP)); +MODULE_ALIAS_L2TP_PWTYPE(11); diff --git a/net/l3mdev/Kconfig b/net/l3mdev/Kconfig new file mode 100644 index 000000000000..5d47325037bc --- /dev/null +++ b/net/l3mdev/Kconfig @@ -0,0 +1,10 @@ +# +# Configuration for L3 master device support +# + +config NET_L3_MASTER_DEV + bool "L3 Master device support" + depends on INET || IPV6 + ---help--- + This module provides glue between core networking code and device + drivers to support L3 master devices like VRF. diff --git a/net/l3mdev/Makefile b/net/l3mdev/Makefile new file mode 100644 index 000000000000..84a53a6f609a --- /dev/null +++ b/net/l3mdev/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the L3 device API +# + +obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev.o diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c new file mode 100644 index 000000000000..8e5ead366e7f --- /dev/null +++ b/net/l3mdev/l3mdev.c @@ -0,0 +1,92 @@ +/* + * net/l3mdev/l3mdev.c - L3 master device implementation + * Copyright (c) 2015 Cumulus Networks + * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/netdevice.h> +#include <net/l3mdev.h> + +/** + * l3mdev_master_ifindex - get index of L3 master device + * @dev: targeted interface + */ + +int l3mdev_master_ifindex_rcu(struct net_device *dev) +{ + int ifindex = 0; + + if (!dev) + return 0; + + if (netif_is_l3_master(dev)) { + ifindex = dev->ifindex; + } else if (netif_is_l3_slave(dev)) { + struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + ifindex = master->ifindex; + } + + return ifindex; +} +EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); + +/** + * l3mdev_fib_table - get FIB table id associated with an L3 + * master interface + * @dev: targeted interface + */ + +u32 l3mdev_fib_table_rcu(const struct net_device *dev) +{ + u32 tb_id = 0; + + if (!dev) + return 0; + + if (netif_is_l3_master(dev)) { + if (dev->l3mdev_ops->l3mdev_fib_table) + tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev); + } else if (netif_is_l3_slave(dev)) { + /* Users of netdev_master_upper_dev_get_rcu need non-const, + * but current inet_*type functions take a const + */ + struct net_device *_dev = (struct net_device *) dev; + const struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(_dev); + if (master && + master->l3mdev_ops->l3mdev_fib_table) + tb_id = master->l3mdev_ops->l3mdev_fib_table(master); + } + + return tb_id; +} +EXPORT_SYMBOL_GPL(l3mdev_fib_table_rcu); + +u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + u32 tb_id = 0; + + if (!ifindex) + return 0; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + tb_id = l3mdev_fib_table_rcu(dev); + + rcu_read_unlock(); + + return tb_id; +} +EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1b91fcd0aa11..68e551e263c6 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2480,8 +2480,13 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, rssi_hyst == bss_conf->cqm_rssi_hyst) return 0; + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER && + !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) + return -EOPNOTSUPP; + bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; + sdata->u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ if (sdata->u.mgd.associated && @@ -2526,15 +2531,17 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { - if (~sdata->rc_rateidx_mcs_mask[i][j]) + if (~sdata->rc_rateidx_mcs_mask[i][j]) { sdata->rc_has_mcs_mask[i] = true; + break; + } + } - if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) + for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { + if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) { sdata->rc_has_vht_mcs_mask[i] = true; - - if (sdata->rc_has_mcs_mask[i] && - sdata->rc_has_vht_mcs_mask[i]) break; + } } } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 88047bf6c0e0..56ef9a8e151c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4218,6 +4218,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; + u32 i; + bool have_80mhz; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4268,6 +4270,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, ht_cap, ht_oper, vht_oper, diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 48d053b00858..b07e2f748f93 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -719,7 +719,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, /* Filter out rates that the STA does not support */ *mask &= sta->supp_rates[sband->band]; - for (i = 0; i < sizeof(mcs_mask); i++) + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 64f1936350c6..c3644458e2ee 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -303,7 +303,6 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; - struct timespec uptime; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); @@ -339,8 +338,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; - ktime_get_ts(&uptime); - sta->last_connected = uptime.tv_sec; + sta->last_connected = ktime_get_seconds(); ewma_signal_init(&sta->avg_signal); for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++) ewma_signal_init(&sta->chain_signal_avg[i]); @@ -1813,7 +1811,6 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; - struct timespec uptime; u32 thr = 0; int i, ac; @@ -1838,8 +1835,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) BIT(NL80211_STA_INFO_RX_DROP_MISC) | BIT(NL80211_STA_INFO_BEACON_LOSS); - ktime_get_ts(&uptime); - sinfo->connected_time = uptime.tv_sec - sta->last_connected; + sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 52f3187f90ba..ecc5e2a8f80b 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1251,6 +1251,58 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->chanctx_mtx); } +static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + bool result = false; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) || + !sta->sta.ht_cap.ht_supported) + continue; + result = true; + break; + } + rcu_read_unlock(); + + return result; +} + +static void +iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool tdls_ht; + u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | + IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | + IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; + u16 opmode; + + /* Nothing to do if the BSS connection uses HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + return; + + tdls_ht = (sta && sta->sta.ht_cap.ht_supported) || + iee80211_tdls_have_ht_peers(sdata); + + opmode = sdata->vif.bss_conf.ht_operation_mode; + + if (tdls_ht) + opmode |= protection; + else + opmode &= ~protection; + + if (opmode == sdata->vif.bss_conf.ht_operation_mode) + return; + + sdata->vif.bss_conf.ht_operation_mode = opmode; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT); +} + int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper) { @@ -1276,6 +1328,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, return -ENOTSUPP; } + /* protect possible bss_conf changes and avoid concurrency in + * ieee80211_bss_info_change_notify() + */ + sdata_lock(sdata); mutex_lock(&local->mtx); tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); @@ -1289,16 +1345,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, iee80211_tdls_recalc_chanctx(sdata); - rcu_read_lock(); + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); if (!sta) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); ret = -ENOLINK; break; } + iee80211_tdls_recalc_ht_protection(sdata, sta); + set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) || !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)); @@ -1320,6 +1378,11 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, ieee80211_flush_queues(local, sdata, false); ret = sta_info_destroy_addr(sdata, peer); + + mutex_lock(&local->sta_mtx); + iee80211_tdls_recalc_ht_protection(sdata, NULL); + mutex_unlock(&local->sta_mtx); + iee80211_tdls_recalc_chanctx(sdata); break; default: @@ -1337,6 +1400,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, &sdata->u.mgd.request_smps_work); mutex_unlock(&local->mtx); + sdata_unlock(sdata); return ret; } diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 834ccdbc74be..ff1c798921a6 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; + bool have_80mhz; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + return; + /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index c865ebb2ace2..57b5e94471af 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -266,6 +266,195 @@ ieee802154_set_ackreq_default(struct wpan_phy *wpan_phy, return 0; } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static void +ieee802154_get_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_table **table) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + *table = &sdata->sec.table; +} + +static void +ieee802154_lock_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + mutex_lock(&sdata->sec_mtx); +} + +static void +ieee802154_unlock_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + mutex_unlock(&sdata->sec_mtx); +} + +static int +ieee802154_set_llsec_params(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_params *params, + int changed) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_set_params(&sdata->sec, params, changed); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_get_llsec_params(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_params *params) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_get_params(&sdata->sec, params); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_key_add(&sdata->sec, id, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_key_del(&sdata->sec, id); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_seclevel_add(&sdata->sec, sl); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_seclevel_del(&sdata->sec, sl); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_device *dev_desc) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_dev_add(&sdata->sec, dev_desc); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_dev_del(&sdata->sec, extended_addr); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_devkey_add(&sdata->sec, extended_addr, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_devkey_del(&sdata->sec, extended_addr, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + const struct cfg802154_ops mac802154_config_ops = { .add_virtual_intf_deprecated = ieee802154_add_iface_deprecated, .del_virtual_intf_deprecated = ieee802154_del_iface_deprecated, @@ -284,4 +473,20 @@ const struct cfg802154_ops mac802154_config_ops = { .set_max_frame_retries = ieee802154_set_max_frame_retries, .set_lbt_mode = ieee802154_set_lbt_mode, .set_ackreq_default = ieee802154_set_ackreq_default, +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + .get_llsec_table = ieee802154_get_llsec_table, + .lock_llsec_table = ieee802154_lock_llsec_table, + .unlock_llsec_table = ieee802154_unlock_llsec_table, + /* TODO above */ + .set_llsec_params = ieee802154_set_llsec_params, + .get_llsec_params = ieee802154_get_llsec_params, + .add_llsec_key = ieee802154_add_llsec_key, + .del_llsec_key = ieee802154_del_llsec_key, + .add_seclevel = ieee802154_add_seclevel, + .del_seclevel = ieee802154_del_seclevel, + .add_device = ieee802154_add_device, + .del_device = ieee802154_del_device, + .add_devkey = ieee802154_add_devkey, + .del_devkey = ieee802154_del_devkey, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index ed26952f9e14..7079cd32a7ad 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -367,12 +367,11 @@ static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata, return 0; } -static int mac802154_header_create(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, - const void *daddr, - const void *saddr, - unsigned len) +static int ieee802154_header_create(struct sk_buff *skb, + struct net_device *dev, + const struct ieee802154_addr *daddr, + const struct ieee802154_addr *saddr, + unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); @@ -423,24 +422,89 @@ static int mac802154_header_create(struct sk_buff *skb, return hlen; } +static const struct wpan_dev_header_ops ieee802154_header_ops = { + .create = ieee802154_header_create, +}; + +/* This header create functionality assumes a 8 byte array for + * source and destination pointer at maximum. To adapt this for + * the 802.15.4 dataframe header we use extended address handling + * here only and intra pan connection. fc fields are mostly fallback + * handling. For provide dev_hard_header for dgram sockets. + */ +static int mac802154_header_create(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, + const void *saddr, + unsigned len) +{ + struct ieee802154_hdr hdr; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct ieee802154_mac_cb cb = { }; + int hlen; + + if (!daddr) + return -EINVAL; + + memset(&hdr.fc, 0, sizeof(hdr.fc)); + hdr.fc.type = IEEE802154_FC_TYPE_DATA; + hdr.fc.ack_request = wpan_dev->ackreq; + hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; + + /* TODO currently a workaround to give zero cb block to set + * security parameters defaults according MIB. + */ + if (mac802154_set_header_security(sdata, &hdr, &cb) < 0) + return -EINVAL; + + hdr.dest.pan_id = wpan_dev->pan_id; + hdr.dest.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr); + + hdr.source.pan_id = hdr.dest.pan_id; + hdr.source.mode = IEEE802154_ADDR_LONG; + + if (!saddr) + hdr.source.extended_addr = wpan_dev->extended_addr; + else + ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr); + + hlen = ieee802154_hdr_push(skb, &hdr); + if (hlen < 0) + return -EINVAL; + + skb_reset_mac_header(skb); + skb->mac_len = hlen; + + if (len > ieee802154_max_payload(&hdr)) + return -EMSGSIZE; + + return hlen; +} + static int mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) { struct ieee802154_hdr hdr; - struct ieee802154_addr *addr = (struct ieee802154_addr *)haddr; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { pr_debug("malformed packet\n"); return 0; } - *addr = hdr.source; - return sizeof(*addr); + if (hdr.source.mode == IEEE802154_ADDR_LONG) { + ieee802154_le64_to_be64(haddr, &hdr.source.extended_addr); + return IEEE802154_EXTENDED_ADDR_LEN; + } + + return 0; } -static struct header_ops mac802154_header_ops = { - .create = mac802154_header_create, - .parse = mac802154_header_parse, +static const struct header_ops mac802154_header_ops = { + .create = mac802154_header_create, + .parse = mac802154_header_parse, }; static const struct net_device_ops mac802154_wpan_ops = { @@ -471,9 +535,29 @@ static void ieee802154_if_setup(struct net_device *dev) dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN); - dev->hard_header_len = MAC802154_FRAME_HARD_HEADER_LEN; - dev->needed_tailroom = 2 + 16; /* FCS + MIC */ - dev->mtu = IEEE802154_MTU; + /* Let hard_header_len set to IEEE802154_MIN_HEADER_LEN. AF_PACKET + * will not send frames without any payload, but ack frames + * has no payload, so substract one that we can send a 3 bytes + * frame. The xmit callback assumes at least a hard header where two + * bytes fc and sequence field are set. + */ + dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1; + /* The auth_tag header is for security and places in private payload + * room of mac frame which stucks between payload and FCS field. + */ + dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + + IEEE802154_FCS_LEN; + /* The mtu size is the payload without mac header in this case. + * We have a dynamic length header with a minimum header length + * which is hard_header_len. In this case we let mtu to the size + * of maximum payload which is IEEE802154_MTU - IEEE802154_FCS_LEN - + * hard_header_len. The FCS which is set by hardware or ndo_start_xmit + * and the minimum mac header which can be evaluated inside driver + * layer. The rest of mac header will be part of payload if greater + * than hard_header_len. + */ + dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN - + dev->hard_header_len; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; } @@ -513,6 +597,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; wpan_dev->promiscuous_mode = false; + wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); @@ -550,7 +635,8 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, if (!ndev) return ERR_PTR(-ENOMEM); - ndev->needed_headroom = local->hw.extra_tx_headroom; + ndev->needed_headroom = local->hw.extra_tx_headroom + + IEEE802154_MAX_HEADER_LEN; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 985e9394e2af..7799d3c41fe2 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -401,6 +401,7 @@ int mac802154_llsec_dev_del(struct mac802154_llsec *sec, __le64 device_addr) hash_del_rcu(&pos->bucket_s); hash_del_rcu(&pos->bucket_hw); + list_del_rcu(&pos->dev.list); call_rcu(&pos->rcu, llsec_dev_free_rcu); return 0; diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index d1c33c1d6b9b..42e96729dae6 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -87,6 +87,10 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, skb->dev = sdata->dev; + /* TODO this should be moved after netif_receive_skb call, otherwise + * wireshark will show a mac header with security fields and the + * payload is already decrypted. + */ rc = mac802154_llsec_decrypt(&sdata->sec, skb); if (rc) { pr_debug("decryption failed: %i\n", rc); diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 7ed439172f30..3827f359b336 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -77,9 +77,6 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) put_unaligned_le16(crc, skb_put(skb, 2)); } - if (skb_cow_head(skb, local->hw.extra_tx_headroom)) - goto err_tx; - /* Stop the netif queue on each sub_if_data object. */ ieee802154_stop_queue(&local->hw); @@ -121,6 +118,10 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int rc; + /* TODO we should move it to wpan_dev_hard_header and dev_hard_header + * functions. The reason is wireshark will show a mac header which is + * with security fields but the payload is not encrypted. + */ rc = mac802154_llsec_encrypt(&sdata->sec, skb); if (rc) { netdev_warn(dev, "encryption failed: %i\n", rc); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 8c5707db53c5..bb185a28de98 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1177,8 +1177,10 @@ static int mpls_net_init(struct net *net) table[0].data = net; net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); - if (net->mpls.ctl == NULL) + if (net->mpls.ctl == NULL) { + kfree(table); return -ENOMEM; + } return 0; } diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 21e70bc9af98..67591aef9cae 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) return en->labels * sizeof(struct mpls_shim_hdr); } -int mpls_output(struct sock *sk, struct sk_buff *skb) +int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 2a5a0704245c..2e907335ee81 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -269,7 +269,7 @@ unsigned int nf_iterate(struct list_head *head, /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ repeat: - verdict = (*elemp)->hook(*elemp, skb, state); + verdict = (*elemp)->hook((*elemp)->priv, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) @@ -388,9 +388,12 @@ EXPORT_SYMBOL(nf_conntrack_destroy); struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfq_ct_hook); -struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly; -EXPORT_SYMBOL_GPL(nfq_ct_nat_hook); - +/* Built-in default zone used e.g. by modules. */ +const struct nf_conntrack_zone nf_ct_zone_dflt = { + .id = NF_CT_DEFAULT_ZONE_ID, + .dir = NF_CT_DEFAULT_ZONE_DIR, +}; +EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ #ifdef CONFIG_NF_NAT_NEEDED diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 338b4047776f..69ab9c2634e1 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -519,8 +519,7 @@ int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret = 0; BUG_ON(!set); @@ -558,8 +557,7 @@ int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret; BUG_ON(!set); @@ -581,8 +579,7 @@ int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret = 0; BUG_ON(!set); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index afe905c208af..691b54fcaf2a 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -152,9 +152,13 @@ htable_bits(u32 hashsize) #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 +/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) (SET_HOST_MASK(family) + 1) +#define CIDR_POS(c) ((c) - 1) #else +/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) SET_HOST_MASK(family) +#define CIDR_POS(c) ((c) - 2) #endif #else @@ -305,7 +309,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) } else if (h->nets[i].cidr[n] < cidr) { j = i; } else if (h->nets[i].cidr[n] == cidr) { - h->nets[cidr - 1].nets[n]++; + h->nets[CIDR_POS(cidr)].nets[n]++; return; } } @@ -314,7 +318,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; } h->nets[i].cidr[n] = cidr; - h->nets[cidr - 1].nets[n] = 1; + h->nets[CIDR_POS(cidr)].nets[n] = 1; } static void @@ -325,8 +329,8 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) for (i = 0; i < nets_length; i++) { if (h->nets[i].cidr[n] != cidr) continue; - h->nets[cidr - 1].nets[n]--; - if (h->nets[cidr - 1].nets[n] > 0) + h->nets[CIDR_POS(cidr)].nets[n]--; + if (h->nets[CIDR_POS(cidr)].nets[n] > 0) return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3c862c0a76d1..a93dfebffa81 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -131,6 +131,13 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netnet4_init(struct hash_netnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -160,7 +167,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; @@ -169,6 +176,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ -357,6 +365,13 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netnet6_init(struct hash_netnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -385,13 +400,14 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 0c68734f5cc4..9a14c237830f 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -142,6 +142,13 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netportnet4_init(struct hash_netportnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -175,7 +182,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; @@ -185,6 +192,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -412,6 +420,13 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netportnet6_init(struct hash_netportnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -445,7 +460,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; @@ -454,6 +469,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 3b6929dec748..b32fb0dbe237 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -162,6 +162,17 @@ config IP_VS_FO If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_OVF + tristate "weighted overflow scheduling" + ---help--- + The weighted overflow scheduling algorithm directs network + connections to the server with the highest weight that is + currently available and overflows to the next when active + connections exceed the node's weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_LBLC tristate "locality-based least-connection scheduling" ---help--- diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 38b2723b2e3d..67f3f4389602 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o +obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index dfd7b65b3d2a..0328f7250693 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -75,7 +75,7 @@ static void ip_vs_app_inc_rcu_free(struct rcu_head *head) * Allocate/initialize app incarnation and register it in proto apps. */ static int -ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, +ip_vs_app_inc_new(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { struct ip_vs_protocol *pp; @@ -107,7 +107,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, } } - ret = pp->register_app(net, inc); + ret = pp->register_app(ipvs, inc); if (ret) goto out; @@ -127,7 +127,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, * Release app incarnation */ static void -ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) +ip_vs_app_inc_release(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_protocol *pp; @@ -135,7 +135,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) return; if (pp->unregister_app) - pp->unregister_app(net, inc); + pp->unregister_app(ipvs, inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", pp->name, inc->name, ntohs(inc->port)); @@ -175,14 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc) * Register an application incarnation in protocol applications */ int -register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, +register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { int result; mutex_lock(&__ip_vs_app_mutex); - result = ip_vs_app_inc_new(net, app, proto, port); + result = ip_vs_app_inc_new(ipvs, app, proto, port); mutex_unlock(&__ip_vs_app_mutex); @@ -191,15 +191,11 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, /* Register application for netns */ -struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app) +struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *a; int err = 0; - if (!ipvs) - return ERR_PTR(-ENOENT); - mutex_lock(&__ip_vs_app_mutex); list_for_each_entry(a, &ipvs->app_list, a_list) { @@ -230,21 +226,17 @@ out_unlock: * We are sure there are no app incarnations attached to services * Caller should use synchronize_rcu() or rcu_barrier() */ -void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) +void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *a, *anxt, *inc, *nxt; - if (!ipvs) - return; - mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { if (app && strcmp(app->name, a->name)) continue; list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { - ip_vs_app_inc_release(net, inc); + ip_vs_app_inc_release(ipvs, inc); } list_del(&a->a_list); @@ -611,17 +603,19 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -int __net_init ip_vs_app_net_init(struct net *net) +int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; INIT_LIST_HEAD(&ipvs->app_list); proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); return 0; } -void __net_exit ip_vs_app_net_cleanup(struct net *net) +void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { - unregister_ip_vs_app(net, NULL /* all */); + struct net *net = ipvs->net; + + unregister_ip_vs_app(ipvs, NULL /* all */); remove_proc_entry("ip_vs_app", net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index b0f7b626b56d..d1d168c7fc68 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -108,7 +108,7 @@ static inline void ct_write_unlock_bh(unsigned int key) /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, +static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { @@ -116,11 +116,11 @@ static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int pro if (af == AF_INET6) return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)net>>8)) & ip_vs_conn_tab_mask; + ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; #endif return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)net>>8)) & ip_vs_conn_tab_mask; + ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, @@ -141,14 +141,14 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, port = p->vport; } - return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); } static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, &cp->caddr, cp->cport, NULL, 0, &p); if (cp->pe) { @@ -279,7 +279,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (!__ip_vs_conn_get(cp)) continue; /* HIT */ @@ -314,33 +314,34 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) } static int -ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, +ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, + int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, - int inverse, struct ip_vs_conn_param *p) + struct ip_vs_conn_param *p) { __be16 _ports[2], *pptr; - struct net *net = skb_net(skb); pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return 1; - if (likely(!inverse)) - ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + if (likely(!ip_vs_iph_inverse(iph))) + ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], p); else - ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr, pptr[1], &iph->saddr, pptr[0], p); return 0; } struct ip_vs_conn * -ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) return NULL; return ip_vs_conn_in_get(&p); @@ -359,7 +360,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (unlikely(p->pe_data && p->pe->ct_match)) { - if (!ip_vs_conn_net_eq(cp, p->net)) + if (cp->ipvs != p->ipvs) continue; if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { if (__ip_vs_conn_get(cp)) @@ -377,7 +378,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (__ip_vs_conn_get(cp)) goto out; } @@ -418,7 +419,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (!__ip_vs_conn_get(cp)) continue; /* HIT */ @@ -439,12 +440,13 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) } struct ip_vs_conn * -ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) return NULL; return ip_vs_conn_out_get(&p); @@ -638,7 +640,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) * so we can make the assumption that the svc_af is the same as the * dest_af */ - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, + dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { @@ -668,7 +670,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) #endif ip_vs_bind_xmit(cp); - pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol); if (pd && atomic_read(&pd->appcnt)) ip_vs_bind_app(cp, pd->pp); } @@ -746,7 +748,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs, int ip_vs_check_template(struct ip_vs_conn *ct) { struct ip_vs_dest *dest = ct->dest; - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); + struct netns_ipvs *ipvs = ct->ipvs; /* * Checking the dest server status. @@ -800,8 +802,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - struct net *net = ip_vs_conn_net(cp); - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = cp->ipvs; /* * do I control anybody? @@ -847,7 +848,7 @@ static void ip_vs_conn_expire(unsigned long data) cp->timeout = 60*HZ; if (ipvs->sync_state & IP_VS_STATE_MASTER) - ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); ip_vs_conn_put(cp); } @@ -875,8 +876,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; - struct netns_ipvs *ipvs = net_ipvs(p->net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs, p->protocol); cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); @@ -887,7 +888,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, INIT_HLIST_NODE(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - ip_vs_conn_net_set(cp, p->net); + cp->ipvs = ipvs; cp->af = p->af; cp->daf = dest_af; cp->protocol = p->protocol; @@ -1061,7 +1062,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; - if (!ip_vs_conn_net_eq(cp, net)) + if (!net_eq(cp->ipvs->net, net)) return 0; if (cp->pe_data) { pe_data[0] = ' '; @@ -1146,7 +1147,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) const struct ip_vs_conn *cp = v; struct net *net = seq_file_net(seq); - if (!ip_vs_conn_net_eq(cp, net)) + if (!net_eq(cp->ipvs->net, net)) return 0; #ifdef CONFIG_IP_VS_IPV6 @@ -1240,7 +1241,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) } /* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(struct net *net) +void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { int idx; struct ip_vs_conn *cp, *cp_c; @@ -1256,7 +1257,7 @@ void ip_vs_random_dropentry(struct net *net) if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; - if (!ip_vs_conn_net_eq(cp, net)) + if (cp->ipvs != ipvs) continue; if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { @@ -1308,18 +1309,17 @@ void ip_vs_random_dropentry(struct net *net) /* * Flush all the connection entries in the ip_vs_conn_tab */ -static void ip_vs_conn_flush(struct net *net) +static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { int idx; struct ip_vs_conn *cp, *cp_c; - struct netns_ipvs *ipvs = net_ipvs(net); flush_again: rcu_read_lock(); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (!ip_vs_conn_net_eq(cp, net)) + if (cp->ipvs != ipvs) continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); @@ -1345,9 +1345,9 @@ flush_again: /* * per netns init and exit */ -int __net_init ip_vs_conn_net_init(struct net *net) +int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; atomic_set(&ipvs->conn_count, 0); @@ -1356,10 +1356,12 @@ int __net_init ip_vs_conn_net_init(struct net *net) return 0; } -void __net_exit ip_vs_conn_net_cleanup(struct net *net) +void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) { + struct net *net = ipvs->net; + /* flush all the connection entries first */ - ip_vs_conn_flush(net); + ip_vs_conn_flush(ipvs); remove_proc_entry("ip_vs_conn", net->proc_net); remove_proc_entry("ip_vs_conn_sync", net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 38fbc194b9cb..07a791ecdfba 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -112,7 +112,7 @@ static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; @@ -146,7 +146,7 @@ static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; @@ -179,7 +179,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_cpu_stats *s; s = this_cpu_ptr(cp->dest->stats.cpustats); @@ -215,7 +215,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { - ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) @@ -245,20 +245,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc, const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ + const union nf_inet_addr *src_addr, *dst_addr; + + if (likely(!ip_vs_iph_inverse(iph))) { + src_addr = &iph->saddr; + dst_addr = &iph->daddr; + } else { + src_addr = &iph->daddr; + dst_addr = &iph->saddr; + } + /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + ipv6_addr_prefix(&snet.in6, &src_addr->in6, (__force __u32) svc->netmask); else #endif - snet.ip = iph->saddr.ip & svc->netmask; + snet.ip = src_addr->ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), - IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -276,7 +286,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, */ { int protocol = iph->protocol; - const union nf_inet_addr *vaddr = &iph->daddr; + const union nf_inet_addr *vaddr = dst_addr; __be16 vport = 0; if (dst_port == svc->port) { @@ -366,8 +376,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, - src_port, &iph->daddr, dst_port, ¶m); + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, + src_port, dst_addr, dst_port, ¶m); cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, skb->mark); @@ -418,7 +428,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_conn *cp = NULL; struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; + __be16 _ports[2], *pptr, cport, vport; + const void *caddr, *vaddr; unsigned int flags; *ignored = 1; @@ -429,14 +440,26 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, if (pptr == NULL) return NULL; + if (likely(!ip_vs_iph_inverse(iph))) { + cport = pptr[0]; + caddr = &iph->saddr; + vport = pptr[1]; + vaddr = &iph->daddr; + } else { + cport = pptr[1]; + caddr = &iph->daddr; + vport = pptr[0]; + vaddr = &iph->saddr; + } + /* * FTPDATA needs this check when using local real server. * Never schedule Active FTPDATA connections from real server. * For LVS-NAT they must be already created. For other methods * with persistence the connection is created on SYN+ACK. */ - if (pptr[0] == FTPDATA) { - IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + if (cport == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling FTPDATA"); return NULL; } @@ -444,19 +467,25 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Do not schedule replies from local real server. */ - if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { - IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, - "Not scheduling reply for existing connection"); - __ip_vs_conn_put(cp); - return NULL; + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph); + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + + if (cp) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, + "Not scheduling reply for existing" + " connection"); + __ip_vs_conn_put(cp); + return NULL; + } } /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, + return ip_vs_sched_persist(svc, skb, cport, vport, ignored, iph); *ignored = 0; @@ -464,7 +493,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Non-persistent service */ - if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->fwmark && vport != svc->port) { if (!svc->port) pr_err("Schedule: port zero only supported " "in persistent services, " @@ -495,11 +524,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, - &iph->saddr, pptr[0], &iph->daddr, - pptr[1], &p); + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, &p); cp = ip_vs_conn_new(&p, dest->af, &dest->addr, - dest->port ? dest->port : pptr[1], + dest->port ? dest->port : vport, flags, dest, skb->mark); if (!cp) { *ignored = -1; @@ -519,6 +547,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return cp; } +#ifdef CONFIG_SYSCTL +static inline int ip_vs_addr_is_unicast(struct net *net, int af, + union nf_inet_addr *addr) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; +#endif + return (inet_addr_type(net, addr->ip) == RTN_UNICAST); +} +#endif /* * Pass or drop the packet. @@ -528,33 +567,21 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { - __be16 _ports[2], *pptr; -#ifdef CONFIG_SYSCTL - struct net *net; - struct netns_ipvs *ipvs; - int unicast; -#endif + __be16 _ports[2], *pptr, dport; + struct netns_ipvs *ipvs = svc->ipvs; + struct net *net = ipvs->net; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); - if (pptr == NULL) { + if (!pptr) return NF_DROP; - } - -#ifdef CONFIG_SYSCTL - net = skb_net(skb); - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; - else -#endif - unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); + dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ - ipvs = net_ipvs(net); - if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { + if (sysctl_cache_bypass(ipvs) && svc->fwmark && + !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && + ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && @@ -566,7 +593,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, @@ -590,7 +617,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_put(cp); return ret; } -#endif /* * When the virtual ftp service is presented, packets destined @@ -598,9 +624,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) + if (svc->port == FTPPORT && dport != FTPPORT) return NF_ACCEPT; + if (unlikely(ip_vs_iph_icmp(iph))) + return NF_DROP; + /* * Notify the client that the destination is unreachable, and * release the socket buffer. @@ -610,11 +639,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { - if (!skb->dev) { - struct net *net_ = dev_net(skb_dst(skb)->dev); - - skb->dev = net_->loopback_dev; - } + if (!skb->dev) + skb->dev = net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif @@ -625,15 +651,13 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_SYSCTL -static int sysctl_snat_reroute(struct sk_buff *skb) +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); return ipvs->sysctl_snat_reroute; } -static int sysctl_nat_icmp_send(struct net *net) +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); return ipvs->sysctl_nat_icmp_send; } @@ -644,8 +668,8 @@ static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) #else -static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } -static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -664,12 +688,13 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) return IP_DEFRAG_VS_OUT; } -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, + struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(ipvs->net, skb, user); local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); @@ -677,10 +702,10 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } -static int ip_vs_route_me_harder(int af, struct sk_buff *skb, - unsigned int hooknum) +static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, + struct sk_buff *skb, unsigned int hooknum) { - if (!sysctl_snat_reroute(skb)) + if (!sysctl_snat_reroute(ipvs)) return 0; /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ if (NF_INET_LOCAL_IN == hooknum) @@ -690,12 +715,12 @@ static int ip_vs_route_me_harder(int af, struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && - ip6_route_me_harder(skb) != 0) + ip6_route_me_harder(ipvs->net, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(skb, RTN_LOCAL) != 0) + ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) return 1; return 0; @@ -848,7 +873,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); - if (ip_vs_route_me_harder(af, skb, hooknum)) + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto out; /* do the statistics and put it back */ @@ -872,8 +897,8 @@ out: * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related, - unsigned int hooknum) +static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; @@ -888,7 +913,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { - if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -934,10 +959,10 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); - ip_vs_fill_ip4hdr(cih, &ciph); - ciph.len += offset; + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); + /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); + cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph); if (!cp) return NF_ACCEPT; @@ -947,16 +972,16 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum, struct ip_vs_iphdr *ipvsh) +static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) { struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; union nf_inet_addr snet; - unsigned int writable; + unsigned int offset; *related = 1; ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); @@ -984,31 +1009,23 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, ic->icmp6_type, ntohs(icmpv6_id(ic)), &ipvsh->saddr, &ipvsh->daddr); - /* Now find the contained IP header */ - ciph.len = ipvsh->len + sizeof(_icmph); - ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); - if (ip6h == NULL) + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), + true, &ciph)) return NF_ACCEPT; /* The packet looks wrong, ignore */ - ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ - ciph.daddr.in6 = ip6h->daddr; - /* skip possible IPv6 exthdrs of contained IPv6 packet */ - ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); - if (ciph.protocol < 0) - return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); + cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph); if (!cp) return NF_ACCEPT; snet.in6 = ciph.saddr.in6; - writable = ciph.len; + offset = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, - pp, writable, sizeof(struct ipv6hdr), + pp, offset, sizeof(struct ipv6hdr), hooknum); } #endif @@ -1093,7 +1110,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, { struct ip_vs_protocol *pp = pd->pp; - IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); + IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); if (!skb_make_writable(skb, iph->len)) goto drop; @@ -1127,10 +1144,10 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * if it came from this machine itself. So re-compute * the routing information. */ - if (ip_vs_route_me_harder(af, skb, hooknum)) + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto drop; - IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); @@ -1155,9 +1172,9 @@ drop: * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) { - struct net *net = NULL; + struct net *net = ipvs->net; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; @@ -1182,16 +1199,15 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - net = skb_net(skb); - if (!net_ipvs(net)->enable) + if (!ipvs->enable) return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_out_icmp_v6(skb, &related, + int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, hooknum, &iph); if (related) @@ -1201,13 +1217,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; - int verdict = ip_vs_out_icmp(skb, &related, hooknum); + int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); if (related) return verdict; } - pd = ip_vs_proto_data_get(net, iph.protocol); + pd = ip_vs_proto_data_get(ipvs, iph.protocol); if (unlikely(!pd)) return NF_ACCEPT; pp = pd->pp; @@ -1217,21 +1233,21 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET) #endif if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; - ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(af, skb, &iph, 0); + cp = pp->conn_out_get(ipvs, af, skb, &iph); if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); - if (sysctl_nat_icmp_send(net) && + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { @@ -1241,7 +1257,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) sizeof(_ports), _ports, &iph); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, pptr[0])) { /* * Notify the real server: there is no @@ -1272,7 +1288,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) } } } - IP_VS_DBG_PKT(12, af, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1283,10 +1299,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); } /* @@ -1294,10 +1310,10 @@ ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1308,10 +1324,10 @@ ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET6); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); } /* @@ -1319,14 +1335,51 @@ ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET6); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); } #endif +static unsigned int +ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_protocol *pp = pd->pp; + + if (!iph->fragoffs) { + /* No (second) fragments need to enter here, as nf_defrag_ipv6 + * replayed fragment zero will already have created the cp + */ + + /* Schedule and create new connection entry into cpp */ + if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) + return 0; + } + + if (unlikely(!*cpp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, af, pp, skb, iph->off, + "ip_vs_in: packet continues traversal as normal"); + if (iph->fragoffs) { + /* Fragment that couldn't be mapped to a conn entry + * is missing module nf_defrag_ipv6 + */ + IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); + IP_VS_DBG_PKT(7, af, pp, skb, iph->off, + "unhandled fragment"); + } + *verdict = NF_ACCEPT; + return 0; + } + + return 1; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1334,9 +1387,9 @@ ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Currently handles error types - unreachable, quench, ttl exceeded. */ static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, + unsigned int hooknum) { - struct net *net = NULL; struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ @@ -1345,13 +1398,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; - bool ipip; + bool ipip, new_cp = false; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { - if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1385,8 +1438,6 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - net = skb_net(skb); - /* Special case for errors for IPIP packets */ ipip = false; if (cih->protocol == IPPROTO_IPIP) { @@ -1402,7 +1453,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ipip = true; } - pd = ip_vs_proto_data_get(net, cih->protocol); + pd = ip_vs_proto_data_get(ipvs, cih->protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; @@ -1416,15 +1467,24 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) "Checking incoming ICMP for"); offset2 = offset; - ip_vs_fill_ip4hdr(cih, &ciph); - ciph.len += offset; + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph); offset = ciph.len; + /* The embedded headers contain source and dest in reverse order. * For IPIP this is error for request, not for reply. */ - cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1); - if (!cp) - return NF_ACCEPT; + cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph); + + if (!cp) { + int v; + + if (!sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) + return v; + new_cp = true; + } verdict = NF_DROP; @@ -1455,7 +1515,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); - ipv4_update_pmtu(skb, dev_net(skb->dev), + ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) @@ -1501,23 +1561,26 @@ ignore_ipip: verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); out: - __ip_vs_conn_put(cp); + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); return verdict; } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum, struct ip_vs_iphdr *iph) +static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *iph) { - struct net *net = NULL; - struct ipv6hdr _ip6h, *ip6h; struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; - unsigned int offs_ciph, writable, verdict; + unsigned int offset, verdict; + bool new_cp = false; *related = 1; @@ -1546,21 +1609,11 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, ic->icmp6_type, ntohs(icmpv6_id(ic)), &iph->saddr, &iph->daddr); - /* Now find the contained IP header */ - ciph.len = iph->len + sizeof(_icmph); - offs_ciph = ciph.len; /* Save ip header offset */ - ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); - if (ip6h == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ - ciph.daddr.in6 = ip6h->daddr; - /* skip possible IPv6 exthdrs of contained IPv6 packet */ - ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); - if (ciph.protocol < 0) - return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ - - net = skb_net(skb); - pd = ip_vs_proto_data_get(net, ciph.protocol); + offset = iph->len + sizeof(_icmph); + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) + return NF_ACCEPT; + + pd = ip_vs_proto_data_get(ipvs, ciph.protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; @@ -1569,36 +1622,49 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, if (ciph.fragoffs) return NF_ACCEPT; - IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, "Checking incoming ICMPv6 for"); /* The embedded headers contain source and dest in reverse order * if not from localhost */ - cp = pp->conn_in_get(AF_INET6, skb, &ciph, - (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1); + cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph); + + if (!cp) { + int v; + + if (!sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) + return v; + + new_cp = true; + } - if (!cp) - return NF_ACCEPT; /* VS/TUN, VS/DR and LOCALNODE just let it go */ if ((hooknum == NF_INET_LOCAL_OUT) && (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { - __ip_vs_conn_put(cp); - return NF_ACCEPT; + verdict = NF_ACCEPT; + goto out; } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); /* Need to mangle contained IPv6 header in ICMPv6 packet */ - writable = ciph.len; + offset = ciph.len; if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || IPPROTO_SCTP == ciph.protocol) - writable += 2 * sizeof(__u16); /* Also mangle ports */ + offset += 2 * sizeof(__u16); /* Also mangle ports */ - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); - __ip_vs_conn_put(cp); +out: + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); return verdict; } @@ -1610,15 +1676,13 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, * and send it on its way... */ static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) { - struct net *net; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, pkts; - struct netns_ipvs *ipvs; int conn_reuse_mode; /* Already marked as IPVS request or reply? */ @@ -1633,7 +1697,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely((skb->pkt_type != PACKET_HOST && hooknum != NF_INET_LOCAL_OUT) || !skb_dst(skb))) { - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" " ignored in hook %u\n", skb->pkt_type, iph.protocol, @@ -1641,12 +1705,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); /* Bad... Do not break raw sockets */ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && @@ -1662,8 +1724,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, - &iph); + int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, + hooknum, &iph); if (related) return verdict; @@ -1672,21 +1734,30 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; - int verdict = ip_vs_in_icmp(skb, &related, hooknum); + int verdict = ip_vs_in_icmp(ipvs, skb, &related, + hooknum); if (related) return verdict; } /* Protocol supported? */ - pd = ip_vs_proto_data_get(net, iph.protocol); - if (unlikely(!pd)) + pd = ip_vs_proto_data_get(ipvs, iph.protocol); + if (unlikely(!pd)) { + /* The only way we'll see this packet again is if it's + * encapsulated, so mark it with ipvs_property=1 so we + * skip it if we're ignoring tunneled packets + */ + if (sysctl_ignore_tunneled(ipvs)) + skb->ipvs_property = 1; + return NF_ACCEPT; + } pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, &iph, 0); + cp = pp->conn_in_get(ipvs, af, skb, &iph); conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); if (conn_reuse_mode && !iph.fragoffs && @@ -1700,32 +1771,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) cp = NULL; } - if (unlikely(!cp) && !iph.fragoffs) { - /* No (second) fragments need to enter here, as nf_defrag_ipv6 - * replayed fragment zero will already have created the cp - */ + if (unlikely(!cp)) { int v; - /* Schedule and create new connection entry into &cp */ - if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph)) + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) return v; } - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, af, pp, skb, 0, - "ip_vs_in: packet continues traversal as normal"); - if (iph.fragoffs) { - /* Fragment that couldn't be mapped to a conn entry - * is missing module nf_defrag_ipv6 - */ - IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); - IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); - } - return NF_ACCEPT; - } + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); - IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1765,7 +1819,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) pkts = atomic_add_return(1, &cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) - ip_vs_sync_conn(net, cp, pkts); + ip_vs_sync_conn(ipvs, cp, pkts); ip_vs_conn_put(cp); return ret; @@ -1776,10 +1830,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_remote_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); } /* @@ -1787,10 +1841,10 @@ ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1800,10 +1854,10 @@ ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_remote_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET6); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); } /* @@ -1811,10 +1865,10 @@ ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET6); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); } #endif @@ -1830,46 +1884,40 @@ ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, * and send them to ip_vs_in_icmp. */ static unsigned int -ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int r; - struct net *net; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = net_ipvs(state->net); if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(skb, &r, ops->hooknum); + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); } #ifdef CONFIG_IP_VS_IPV6 static unsigned int -ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int r; - struct net *net; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = net_ipvs(state->net); struct ip_vs_iphdr iphdr; - ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); } #endif @@ -1999,22 +2047,22 @@ static int __net_init __ip_vs_init(struct net *net) atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; - if (ip_vs_estimator_net_init(net) < 0) + if (ip_vs_estimator_net_init(ipvs) < 0) goto estimator_fail; - if (ip_vs_control_net_init(net) < 0) + if (ip_vs_control_net_init(ipvs) < 0) goto control_fail; - if (ip_vs_protocol_net_init(net) < 0) + if (ip_vs_protocol_net_init(ipvs) < 0) goto protocol_fail; - if (ip_vs_app_net_init(net) < 0) + if (ip_vs_app_net_init(ipvs) < 0) goto app_fail; - if (ip_vs_conn_net_init(net) < 0) + if (ip_vs_conn_net_init(ipvs) < 0) goto conn_fail; - if (ip_vs_sync_net_init(net) < 0) + if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", @@ -2025,15 +2073,15 @@ static int __net_init __ip_vs_init(struct net *net) */ sync_fail: - ip_vs_conn_net_cleanup(net); + ip_vs_conn_net_cleanup(ipvs); conn_fail: - ip_vs_app_net_cleanup(net); + ip_vs_app_net_cleanup(ipvs); app_fail: - ip_vs_protocol_net_cleanup(net); + ip_vs_protocol_net_cleanup(ipvs); protocol_fail: - ip_vs_control_net_cleanup(net); + ip_vs_control_net_cleanup(ipvs); control_fail: - ip_vs_estimator_net_cleanup(net); + ip_vs_estimator_net_cleanup(ipvs); estimator_fail: net->ipvs = NULL; return -ENOMEM; @@ -2041,22 +2089,25 @@ estimator_fail: static void __net_exit __ip_vs_cleanup(struct net *net) { - ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(net); - ip_vs_app_net_cleanup(net); - ip_vs_protocol_net_cleanup(net); - ip_vs_control_net_cleanup(net); - ip_vs_estimator_net_cleanup(net); - IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); net->ipvs = NULL; } static void __net_exit __ip_vs_dev_cleanup(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); EnterFunction(2); - net_ipvs(net)->enable = 0; /* Disable packet reception */ + ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); - ip_vs_sync_net_cleanup(net); + ip_vs_sync_net_cleanup(ipvs); LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 24c554201a76..e7c1b052c2a3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -228,7 +228,7 @@ static void defense_work_handler(struct work_struct *work) update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) - ip_vs_random_dropentry(ipvs->net); + ip_vs_random_dropentry(ipvs); schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); } #endif @@ -263,7 +263,7 @@ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; * Returns hash value for virtual service */ static inline unsigned int -ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, +ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { register unsigned int porth = ntohs(port); @@ -276,7 +276,7 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, addr->ip6[2]^addr->ip6[3]; #endif ahash = ntohl(addr_fold); - ahash ^= ((size_t) net >> 8); + ahash ^= ((size_t) ipvs >> 8); return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & IP_VS_SVC_TAB_MASK; @@ -285,9 +285,9 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) { - return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; + return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* @@ -309,14 +309,14 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ - hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); + hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } @@ -357,21 +357,21 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * -__ip_vs_service_find(struct net *net, int af, __u16 protocol, +__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol) - && net_eq(svc->net, net)) { + && (svc->ipvs == ipvs)) { /* HIT */ return svc; } @@ -385,17 +385,17 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * -__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) +__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(net, fwmark); + hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af - && net_eq(svc->net, net)) { + && (svc->ipvs == ipvs)) { /* HIT */ return svc; } @@ -406,17 +406,16 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) /* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, +ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; - struct netns_ipvs *ipvs = net_ipvs(net); /* * Check the table hashed by fwmark first */ if (fwmark) { - svc = __ip_vs_svc_fwm_find(net, af, fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } @@ -425,7 +424,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, * Check the table hashed by <protocol,addr,port> * for "full" addressed entries */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -435,7 +434,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); } if (svc == NULL @@ -443,7 +442,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); } out: @@ -543,10 +542,9 @@ static void ip_vs_rs_unhash(struct ip_vs_dest *dest) } /* Check if real service by <proto,addr,port> is present */ -bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, +bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { - struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash; struct ip_vs_dest *dest; @@ -601,7 +599,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, * on the backup. * Called under RCU lock, no refcnt is returned. */ -struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, +struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, @@ -612,7 +610,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) @@ -660,7 +658,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; /* * Find the destination in trash @@ -715,10 +713,9 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest) * are expired, and the refcnt of each destination in the trash must * be 0, so we simply release them here. */ -static void ip_vs_trash_cleanup(struct net *net) +static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; - struct netns_ipvs *ipvs = net_ipvs(net); del_timer_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ @@ -788,7 +785,7 @@ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; @@ -843,7 +840,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock_bh(&dest->dst_lock); if (add) { - ip_vs_start_estimator(svc->net, &dest->stats); + ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); @@ -874,12 +871,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) + !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; } else #endif { - atype = inet_addr_type(svc->net, udest->addr.ip); + atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } @@ -1036,12 +1033,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, +static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, bool cleanup) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_stop_estimator(net, &dest->stats); + ip_vs_stop_estimator(ipvs, &dest->stats); /* * Remove it from the d-linked list with the real services. @@ -1079,7 +1074,7 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, svc->num_dests--; if (dest->af != svc->af) - net_ipvs(svc->net)->mixed_address_family_dests--; + svc->ipvs->mixed_address_family_dests--; if (svcupd) { struct ip_vs_scheduler *sched; @@ -1120,7 +1115,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete the destination */ - __ip_vs_del_dest(svc->net, dest, false); + __ip_vs_del_dest(svc->ipvs, dest, false); LeaveFunction(2); @@ -1129,8 +1124,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) static void ip_vs_dest_trash_expire(unsigned long data) { - struct net *net = (struct net *) data; - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = (struct netns_ipvs *)data; struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -1163,14 +1157,13 @@ static void ip_vs_dest_trash_expire(unsigned long data) * Add a service into the service hash table */ static int -ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, +ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0, i; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; - struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); @@ -1237,7 +1230,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, svc->flags = u->flags; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - svc->net = net; + svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); @@ -1261,7 +1254,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - ip_vs_start_estimator(net, &svc->stats); + ip_vs_start_estimator(ipvs, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) @@ -1381,7 +1374,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; pr_info("%s: enter\n", __func__); @@ -1389,7 +1382,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) if (svc->af == AF_INET) ipvs->num_services--; - ip_vs_stop_estimator(svc->net, &svc->stats); + ip_vs_stop_estimator(svc->ipvs, &svc->stats); /* Unbind scheduler */ old_sched = rcu_dereference_protected(svc->scheduler, 1); @@ -1405,7 +1398,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(svc->net, dest, cleanup); + __ip_vs_del_dest(svc->ipvs, dest, cleanup); } /* @@ -1456,7 +1449,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(struct net *net, bool cleanup) +static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { int idx; struct ip_vs_service *svc; @@ -1468,7 +1461,7 @@ static int ip_vs_flush(struct net *net, bool cleanup) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } @@ -1479,7 +1472,7 @@ static int ip_vs_flush(struct net *net, bool cleanup) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } @@ -1491,12 +1484,12 @@ static int ip_vs_flush(struct net *net, bool cleanup) * Delete service by {netns} in the service table. * Called by __ip_vs_cleanup() */ -void ip_vs_service_net_cleanup(struct net *net) +void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) { EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(net, true); + ip_vs_flush(ipvs, true); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } @@ -1540,7 +1533,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) { + if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); @@ -1549,7 +1542,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, } hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) { + if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); @@ -1583,26 +1576,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) return 0; } -static int ip_vs_zero_all(struct net *net) +static int ip_vs_zero_all(struct netns_ipvs *ipvs) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } - ip_vs_zero_stats(&net_ipvs(net)->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats); return 0; } @@ -1615,7 +1608,7 @@ static int proc_do_defense_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int rc; @@ -1626,7 +1619,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, /* Restore the correct value */ *valp = val; } else { - update_defense_level(net_ipvs(net)); + update_defense_level(ipvs); } } return rc; @@ -1844,6 +1837,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "schedule_icmp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ignore_tunneled", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -1889,6 +1894,7 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; @@ -1896,7 +1902,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net) && pos-- == 0) { + if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; @@ -1908,7 +1914,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net) && pos-- == 0) { + if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; @@ -2196,7 +2202,7 @@ static const struct file_operations ip_vs_stats_percpu_fops = { /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ -static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) +static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; @@ -2209,13 +2215,13 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } @@ -2223,7 +2229,7 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } @@ -2335,24 +2341,34 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - mutex_lock(&ipvs->sync_mutex); - if (cmd == IP_VS_SO_SET_STARTDAEMON) - ret = start_sync_thread(net, dm->state, dm->mcast_ifn, - dm->syncid); - else - ret = stop_sync_thread(net, dm->state); - mutex_unlock(&ipvs->sync_mutex); + if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ipvs_sync_daemon_cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + strlcpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)); + cfg.syncid = dm->syncid; + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); + ret = start_sync_thread(ipvs, &cfg, dm->state); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + } else { + mutex_lock(&ipvs->sync_mutex); + ret = stop_sync_thread(ipvs, dm->state); + mutex_unlock(&ipvs->sync_mutex); + } goto out_dec; } mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(net, false); + ret = ip_vs_flush(ipvs, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); + ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; } @@ -2367,7 +2383,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(net); + ret = ip_vs_zero_all(ipvs); goto out_unlock; } } @@ -2385,10 +2401,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by <protocol, addr, port> or fwmark */ rcu_read_lock(); if (usvc.fwmark == 0) - svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, + svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD @@ -2402,7 +2418,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(net, &usvc, &svc); + ret = ip_vs_add_service(ipvs, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); @@ -2461,7 +2477,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) } static inline int -__ip_vs_get_service_entries(struct net *net, +__ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { @@ -2473,7 +2489,7 @@ __ip_vs_get_service_entries(struct net *net, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || !net_eq(svc->net, net)) + if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) @@ -2492,7 +2508,7 @@ __ip_vs_get_service_entries(struct net *net, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || !net_eq(svc->net, net)) + if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) @@ -2512,7 +2528,7 @@ out: } static inline int -__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, +__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; @@ -2521,9 +2537,9 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, rcu_read_lock(); if (get->fwmark) - svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); else - svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, + svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, get->port); rcu_read_unlock(); @@ -2568,7 +2584,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, } static inline void -__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) +__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; @@ -2577,12 +2593,12 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) memset(u, 0, sizeof (*u)); #ifdef CONFIG_IP_VS_PROTO_TCP - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP - pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); u->udp_timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif @@ -2645,15 +2661,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ipvs->master_syncid; + d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ipvs->backup_syncid; + d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; @@ -2701,7 +2717,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_service_entries(net, get, user); + ret = __ip_vs_get_service_entries(ipvs, get, user); } break; @@ -2715,9 +2731,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) addr.ip = entry->addr; rcu_read_lock(); if (entry->fwmark) - svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); else - svc = __ip_vs_service_find(net, AF_INET, + svc = __ip_vs_service_find(ipvs, AF_INET, entry->protocol, &addr, entry->port); rcu_read_unlock(); @@ -2743,7 +2759,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_dest_entries(net, get, user); + ret = __ip_vs_get_dest_entries(ipvs, get, user); } break; @@ -2751,7 +2767,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } @@ -2808,6 +2824,11 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, + [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ @@ -2981,12 +3002,13 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start || !net_eq(svc->net, net)) + if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2997,7 +3019,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start || !net_eq(svc->net, net)) + if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -3013,7 +3035,7 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_parse_service(struct net *net, +static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, struct ip_vs_service **ret_svc) @@ -3058,9 +3080,9 @@ static int ip_vs_genl_parse_service(struct net *net, rcu_read_lock(); if (usvc->fwmark) - svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else - svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, + svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); rcu_read_unlock(); *ret_svc = svc; @@ -3098,14 +3120,14 @@ static int ip_vs_genl_parse_service(struct net *net, return 0; } -static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, +static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; - ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); + ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, 0, &svc); return ret ? ERR_PTR(ret) : svc; } @@ -3180,7 +3202,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); @@ -3190,7 +3213,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, goto out_err; - svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); + svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc) || svc == NULL) goto out_err; @@ -3266,7 +3289,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid) + struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; @@ -3275,9 +3298,23 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || - nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || - nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || + nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; +#ifdef CONFIG_IP_VS_IPV6 + if (c->mcast_af == AF_INET6) { + if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, + &c->mcast_group.in6)) + goto nla_put_failure; + } else +#endif + if (c->mcast_af == AF_INET && + nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, + c->mcast_group.ip)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3288,7 +3325,7 @@ nla_put_failure: } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid, + struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; @@ -3298,7 +3335,7 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, if (!hdr) return -EMSGSIZE; - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); @@ -3312,14 +3349,13 @@ nla_put_failure: static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ipvs->master_mcast_ifn, - ipvs->master_syncid, cb) < 0) + &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; @@ -3327,8 +3363,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ipvs->backup_mcast_ifn, - ipvs->backup_syncid, cb) < 0) + &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; @@ -3340,39 +3375,90 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { + struct ipvs_sync_daemon_cfg c; + struct nlattr *a; + int ret; + + memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + sizeof(c.mcast_ifn)); + c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); + + a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; + if (a) + c.sync_maxlen = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; + if (a) { + c.mcast_af = AF_INET; + c.mcast_group.ip = nla_get_in_addr(a); + if (!ipv4_is_multicast(c.mcast_group.ip)) + return -EINVAL; + } else { + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; + if (a) { +#ifdef CONFIG_IP_VS_IPV6 + int addr_type; + + c.mcast_af = AF_INET6; + c.mcast_group.in6 = nla_get_in6_addr(a); + addr_type = ipv6_addr_type(&c.mcast_group.in6); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + return -EINVAL; +#else + return -EAFNOSUPPORT; +#endif + } + } + + a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; + if (a) + c.mcast_port = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; + if (a) + c.mcast_ttl = nla_get_u8(a); /* The synchronization protocol is incompatible with mixed family * services */ - if (net_ipvs(net)->mixed_address_family_dests > 0) + if (ipvs->mixed_address_family_dests > 0) return -EINVAL; - return start_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); + ret = start_sync_thread(ipvs, &c, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + return ret; } -static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { + int ret; + if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - return stop_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_lock(&ipvs->sync_mutex); + ret = stop_sync_thread(ipvs, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_unlock(&ipvs->sync_mutex); + return ret; } -static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); @@ -3384,38 +3470,33 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - return ip_vs_set_timeout(net, &t); + return ip_vs_set_timeout(ipvs, &t); } static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { - int ret = 0, cmd; - struct net *net; - struct netns_ipvs *ipvs; + int ret = -EINVAL, cmd; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); - ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; - mutex_lock(&ipvs->sync_mutex); if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) { - ret = -EINVAL; + ip_vs_daemon_policy)) goto out; - } if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(net, daemon_attrs); + ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); else - ret = ip_vs_genl_del_daemon(net, daemon_attrs); -out: - mutex_unlock(&ipvs->sync_mutex); + ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); } + +out: return ret; } @@ -3426,22 +3507,22 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) struct ip_vs_dest_user_kern udest; int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; - struct net *net; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(net, false); + ret = ip_vs_flush(ipvs, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(net, info->attrs); + ret = ip_vs_genl_set_config(ipvs, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(net); + ret = ip_vs_zero_all(ipvs); goto out; } @@ -3451,7 +3532,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = 1; - ret = ip_vs_genl_parse_service(net, &usvc, + ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) @@ -3490,7 +3571,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) /* The synchronization protocol is incompatible * with mixed family services */ - if (net_ipvs(net)->sync_state) { + if (ipvs->sync_state) { ret = -EINVAL; goto out; } @@ -3510,7 +3591,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) - ret = ip_vs_add_service(net, &usvc, &svc); + ret = ip_vs_add_service(ipvs, &usvc, &svc); else ret = -EEXIST; break; @@ -3548,9 +3629,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; - struct net *net; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3579,7 +3660,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc; - svc = ip_vs_genl_find_service(net, + svc = ip_vs_genl_find_service(ipvs, info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); @@ -3600,7 +3681,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); #ifdef CONFIG_IP_VS_PROTO_TCP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout) || @@ -3755,10 +3836,10 @@ static void ip_vs_genl_unregister(void) * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL -static int __net_init ip_vs_control_net_init_sysctl(struct net *net) +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { + struct net *net = ipvs->net; int idx; - struct netns_ipvs *ipvs = net_ipvs(net); struct ctl_table *tbl; atomic_set(&ipvs->dropentry, 0); @@ -3777,6 +3858,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) } else tbl = vs_vars; /* Initialize sysctl defaults */ + for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { + if (tbl[idx].proc_handler == proc_do_defense_mode) + tbl[idx].extra2 = ipvs; + } idx = 0; ipvs->sysctl_amemthresh = 1024; tbl[idx++].data = &ipvs->sysctl_amemthresh; @@ -3818,7 +3903,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_conn_reuse_mode = 1; tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; - + tbl[idx++].data = &ipvs->sysctl_schedule_icmp; + tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { @@ -3826,7 +3912,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) kfree(tbl); return -ENOMEM; } - ip_vs_start_estimator(net, &ipvs->tot_stats); + ip_vs_start_estimator(ipvs, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); @@ -3835,14 +3921,14 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) return 0; } -static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(net, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -3850,8 +3936,8 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) #else -static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } -static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } #endif @@ -3859,10 +3945,10 @@ static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, }; -int __net_init ip_vs_control_net_init(struct net *net) +int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { + struct net *net = ipvs->net; int i, idx; - struct netns_ipvs *ipvs = net_ipvs(net); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -3871,7 +3957,7 @@ int __net_init ip_vs_control_net_init(struct net *net) INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, - (unsigned long) net); + (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); @@ -3893,7 +3979,7 @@ int __net_init ip_vs_control_net_init(struct net *net) proc_create("ip_vs_stats_percpu", 0, net->proc_net, &ip_vs_stats_percpu_fops); - if (ip_vs_control_net_init_sysctl(net)) + if (ip_vs_control_net_init_sysctl(ipvs)) goto err; return 0; @@ -3903,12 +3989,12 @@ err: return -ENOMEM; } -void __net_exit ip_vs_control_net_cleanup(struct net *net) +void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; - ip_vs_trash_cleanup(net); - ip_vs_control_net_cleanup_sysctl(net); + ip_vs_trash_cleanup(ipvs); + ip_vs_control_net_cleanup_sysctl(ipvs); remove_proc_entry("ip_vs_stats_percpu", net->proc_net); remove_proc_entry("ip_vs_stats", net->proc_net); remove_proc_entry("ip_vs", net->proc_net); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index ef0eb0a8d552..457c6c193e13 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -102,10 +102,8 @@ static void estimation_timer(unsigned long arg) struct ip_vs_estimator *e; struct ip_vs_stats *s; u64 rate; - struct net *net = (struct net *)arg; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = (struct netns_ipvs *)arg; - ipvs = net_ipvs(net); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); @@ -140,9 +138,8 @@ static void estimation_timer(unsigned long arg) mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); @@ -152,9 +149,8 @@ void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) spin_unlock_bh(&ipvs->est_lock); } -void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; spin_lock_bh(&ipvs->est_lock); @@ -192,18 +188,16 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) dst->outbps = (e->outbps + 0xF) >> 5; } -int __net_init ip_vs_estimator_net_init(struct net *net) +int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - INIT_LIST_HEAD(&ipvs->est_list); spin_lock_init(&ipvs->est_lock); - setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)ipvs); mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } -void __net_exit ip_vs_estimator_net_cleanup(struct net *net) +void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&net_ipvs(net)->est_timer); + del_timer_sync(&ipvs->est_timer); } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 5d3daae98bf0..d30c327bb578 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -181,7 +181,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; - struct net *net; *diff = 0; @@ -223,14 +222,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, */ { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + ip_vs_conn_fill_param(cp->ipvs, AF_INET, iph->protocol, &from, port, &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), + ip_vs_conn_fill_param(cp->ipvs, AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); /* As above, this is ipv4 only */ @@ -289,9 +288,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * would be adjusted twice. */ - net = skb_net(skb); cp->app_data = NULL; - ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return ret; } @@ -320,7 +318,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; - struct net *net; /* no diff required for incoming packets */ *diff = 0; @@ -392,7 +389,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + ip_vs_conn_fill_param(cp->ipvs, AF_INET, iph->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); @@ -413,8 +410,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Move tunnel to listen state */ - net = skb_net(skb); - ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return 1; @@ -447,14 +443,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net) if (!ipvs) return -ENOENT; - app = register_ip_vs_app(net, &ip_vs_ftp); + app = register_ip_vs_app(ipvs, &ip_vs_ftp); if (IS_ERR(app)) return PTR_ERR(app); for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; - ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); + ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; pr_info("%s: loaded support on port[%d] = %d\n", @@ -463,7 +459,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) return 0; err_unreg: - unregister_ip_vs_app(net, &ip_vs_ftp); + unregister_ip_vs_app(ipvs, &ip_vs_ftp); return ret; } /* @@ -471,7 +467,12 @@ err_unreg: */ static void __ip_vs_ftp_exit(struct net *net) { - unregister_ip_vs_app(net, &ip_vs_ftp); + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return; + + unregister_ip_vs_app(ipvs, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 127f14046c51..cccf4d637412 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -250,8 +250,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc) static int sysctl_lblc_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL - struct netns_ipvs *ipvs = net_ipvs(svc->net); - return ipvs->sysctl_lblc_expiration; + return svc->ipvs->sysctl_lblc_expiration; #else return DEFAULT_EXPIRATION; #endif diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 2229d2d8bbe0..796d70e47ddd 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -415,8 +415,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) static int sysctl_lblcr_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL - struct netns_ipvs *ipvs = net_ipvs(svc->net); - return ipvs->sysctl_lblcr_expiration; + return svc->ipvs->sysctl_lblcr_expiration; #else return DEFAULT_EXPIRATION; #endif diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 136184572fc9..30434fb133df 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -161,7 +161,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, + ip_vs_conn_fill_param(net_ipvs(net), exp->tuple.src.l3num, orig->dst.protonum, &orig->src.u3, orig->src.u.tcp.port, &orig->dst.u3, orig->dst.u.tcp.port, &p); cp = ip_vs_conn_out_get(&p); @@ -274,8 +274,7 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) " for conn " FMT_CONN "\n", __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); - h = nf_conntrack_find_get(ip_vs_conn_net(cp), &nf_ct_zone_dflt, - &tuple); + h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); /* Show what happens instead of calling nf_ct_kill() */ diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c new file mode 100644 index 000000000000..f7d62c3b7329 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -0,0 +1,86 @@ +/* + * IPVS: Overflow-Connection Scheduling module + * + * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Scheduler implements "overflow" loadbalancing according to number of active + * connections , will keep all conections to the node with the highest weight + * and overflow to the next node if the number of connections exceeds the node's + * weight. + * Note that this scheduler might not be suitable for UDP because it only uses + * active connections + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + +/* OVF Connection scheduling */ +static struct ip_vs_dest * +ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *h = NULL; + int hw = 0, w; + + IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n"); + /* select the node with highest weight, go to next in line if active + * connections exceed weight + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + w = atomic_read(&dest->weight); + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->activeconns) > w || + w == 0) + continue; + if (!h || w > hw) { + h = dest; + hw = w; + } + } + + if (h) { + IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n", + IP_VS_DBG_ADDR(h->af, &h->addr), + ntohs(h->port), + atomic_read(&h->activeconns), + atomic_read(&h->weight)); + return h; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_ovf_scheduler = { + .name = "ovf", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list), + .schedule = ip_vs_ovf_schedule, +}; + +static int __init ip_vs_ovf_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_ovf_scheduler); +} + +static void __exit ip_vs_ovf_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_ovf_init); +module_exit(ip_vs_ovf_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index bed5f7042529..1b8d594e493a 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -70,7 +70,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) const char *dptr; int retc; - ip_vs_fill_iph_skb(p->af, skb, &iph); + ip_vs_fill_iph_skb(p->af, skb, false, &iph); /* Only useful with UDP */ if (iph.protocol != IPPROTO_UDP) diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 939f7fbe9b46..8ae480715cea 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -63,9 +63,8 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) * register an ipvs protocols netns related data */ static int -register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { - struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); @@ -79,7 +78,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) atomic_set(&pd->appcnt, 0); /* Init app counter */ if (pp->init_netns != NULL) { - int ret = pp->init_netns(net, pd); + int ret = pp->init_netns(ipvs, pd); if (ret) { /* unlink an free proto data */ ipvs->proto_data_table[hash] = pd->next; @@ -116,9 +115,8 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) * unregister an ipvs protocols netns data */ static int -unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data **pd_p; unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); @@ -127,7 +125,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) if (*pd_p == pd) { *pd_p = pd->next; if (pd->pp->exit_netns != NULL) - pd->pp->exit_netns(net, pd); + pd->pp->exit_netns(ipvs, pd); kfree(pd); return 0; } @@ -156,8 +154,8 @@ EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ -static struct ip_vs_proto_data * -__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; unsigned int hash = IP_VS_PROTO_HASH(proto); @@ -169,14 +167,6 @@ __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) return NULL; } - -struct ip_vs_proto_data * -ip_vs_proto_data_get(struct net *net, unsigned short proto) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - - return __ipvs_proto_data_get(ipvs, proto); -} EXPORT_SYMBOL(ip_vs_proto_data_get); /* @@ -317,7 +307,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -int __net_init ip_vs_protocol_net_init(struct net *net) +int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) { int i, ret; static struct ip_vs_protocol *protos[] = { @@ -339,27 +329,26 @@ int __net_init ip_vs_protocol_net_init(struct net *net) }; for (i = 0; i < ARRAY_SIZE(protos); i++) { - ret = register_ip_vs_proto_netns(net, protos[i]); + ret = register_ip_vs_proto_netns(ipvs, protos[i]); if (ret < 0) goto cleanup; } return 0; cleanup: - ip_vs_protocol_net_cleanup(net); + ip_vs_protocol_net_cleanup(ipvs); return ret; } -void __net_exit ip_vs_protocol_net_cleanup(struct net *net) +void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; int i; /* unregister all the ipvs proto data for this netns */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pd = ipvs->proto_data_table[i]) != NULL) - unregister_ip_vs_proto_netns(net, pd); + unregister_ip_vs_proto_netns(ipvs, pd); } } diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 5de3dd312c0f..5320d39976e1 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -41,30 +41,28 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 static void -ah_esp_conn_fill_param_proto(struct net *net, int af, - const struct ip_vs_iphdr *iph, int inverse, +ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af, + const struct ip_vs_iphdr *iph, struct ip_vs_conn_param *p) { - if (likely(!inverse)) - ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + if (likely(!ip_vs_iph_inverse(iph))) + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else - ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - int inverse) +ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; - struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* @@ -73,7 +71,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, */ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", - inverse ? "ICMP+" : "", + ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); @@ -84,19 +82,18 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, static struct ip_vs_conn * -ah_esp_conn_out_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; - struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", - inverse ? "ICMP+" : "", + ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); @@ -107,7 +104,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 5b84c0b56642..010ddeec135f 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -9,35 +9,44 @@ #include <net/ip_vs.h> static int -sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; - struct netns_ipvs *ipvs; sctp_chunkhdr_t _schunkh, *sch; sctp_sctphdr_t *sh, _sctph; - - sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); - if (sh == NULL) { - *verdict = NF_DROP; - return 0; + __be16 _ports[2], *ports = NULL; + + if (likely(!ip_vs_iph_icmp(iph))) { + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh) { + sch = skb_header_pointer( + skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); + if (sch && (sch->type == SCTP_CID_INIT || + sysctl_sloppy_sctp(ipvs))) + ports = &sh->source; + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); } - sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), - sizeof(_schunkh), &_schunkh); - if (sch == NULL) { + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); - ipvs = net_ipvs(net); rcu_read_lock(); - if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && - (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, sh->dest))) { + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { @@ -474,14 +483,13 @@ static inline __u16 sctp_app_hashkey(__be16 port) & SCTP_APP_TAB_MASK; } -static int sctp_register_app(struct net *net, struct ip_vs_app *inc) +static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); hash = sctp_app_hashkey(port); @@ -498,9 +506,9 @@ out: return ret; } -static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) +static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -508,7 +516,7 @@ static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) static int sctp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -549,10 +557,8 @@ out: * timeouts is netns related now. * --------------------------------------------- */ -static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); @@ -561,7 +567,7 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 8e92beb0cca9..d7024b2ed769 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -32,27 +32,47 @@ #include <net/ip_vs.h> static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; - struct netns_ipvs *ipvs; + __be16 _ports[2], *ports = NULL; - th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); - if (th == NULL) { + /* In the event of icmp, we're only guaranteed to have the first 8 + * bytes of the transport header, so we only check the rest of the + * TCP packet for non-ICMP packets + */ + if (likely(!ip_vs_iph_icmp(iph))) { + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th) { + if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) + return 1; + ports = &th->source; + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); - ipvs = net_ipvs(net); + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ rcu_read_lock(); - if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && - (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, th->dest))) { + + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + + if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { @@ -571,14 +591,13 @@ static inline __u16 tcp_app_hashkey(__be16 port) } -static int tcp_register_app(struct net *net, struct ip_vs_app *inc) +static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); @@ -597,9 +616,9 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) static void -tcp_unregister_app(struct net *net, struct ip_vs_app *inc) +tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -609,7 +628,7 @@ tcp_unregister_app(struct net *net, struct ip_vs_app *inc) static int tcp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -653,9 +672,9 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ -void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; @@ -668,10 +687,8 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) * timeouts is netns related now. * --------------------------------------------- */ -static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); @@ -681,7 +698,7 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index b62a3c0ff9bf..e494e9a88c7f 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -29,28 +29,42 @@ #include <net/ip6_checksum.h> static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; + __be16 _ports[2], *ports = NULL; - /* IPv6 fragments, only first fragment will hit this */ - uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); - if (uh == NULL) { + if (likely(!ip_vs_iph_icmp(iph))) { + /* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (uh) + ports = &uh->source; + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); + rcu_read_lock(); - svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, uh->dest); + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + if (svc) { int ignored; - if (ip_vs_todrop(net_ipvs(net))) { + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( @@ -348,14 +362,13 @@ static inline __u16 udp_app_hashkey(__be16 port) } -static int udp_register_app(struct net *net, struct ip_vs_app *inc) +static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); hash = udp_app_hashkey(port); @@ -374,9 +387,9 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc) static void -udp_unregister_app(struct net *net, struct ip_vs_app *inc) +udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -385,7 +398,7 @@ udp_unregister_app(struct net *net, struct ip_vs_app *inc) static int udp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -456,10 +469,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; } -static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); @@ -468,7 +479,7 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 98a13433b68c..1e373a5e44e3 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -280,35 +280,29 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, static inline __be16 ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { - __be16 port; - struct tcphdr _tcph, *th; - struct udphdr _udph, *uh; - sctp_sctphdr_t _sctph, *sh; + __be16 _ports[2], *ports; + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ switch (iph->protocol) { case IPPROTO_TCP: - th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); - if (unlikely(th == NULL)) - return 0; - port = th->source; - break; case IPPROTO_UDP: - uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); - if (unlikely(uh == NULL)) - return 0; - port = uh->source; - break; case IPPROTO_SCTP: - sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); - if (unlikely(sh == NULL)) + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) return 0; - port = sh->source; - break; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; default: - port = 0; + return 0; } - - return port; } @@ -322,6 +316,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_dest *dest; struct ip_vs_sh_state *s; __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); @@ -331,9 +328,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, s = (struct ip_vs_sh_state *) svc->sched_data; if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) - dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port); else - dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + dest = ip_vs_sh_get(svc, s, hash_addr, port); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); @@ -341,7 +338,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph->saddr), + IP_VS_DBG_ADDR(svc->af, hash_addr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d99ad93eb855..803001a45aa1 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -193,7 +193,7 @@ union ip_vs_sync_conn { #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) struct ip_vs_sync_thread_data { - struct net *net; + struct netns_ipvs *ipvs; struct socket *sock; char *buf; int id; @@ -262,6 +262,11 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; +union ipvs_sockaddr { + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + struct ip_vs_sync_buff { struct list_head list; unsigned long firstuse; @@ -320,26 +325,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) * Create a new sync buffer for Version 1 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; - sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->syncid = ipvs->mcfg.syncid; sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); - sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)sb->mesg + len; sb->firstuse = jiffies; return sb; @@ -402,7 +409,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) * Create a new sync buffer for Version 0 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; struct ip_vs_sync_mesg_v0 *mesg; @@ -410,17 +417,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; - mesg->syncid = ipvs->master_syncid; + mesg->syncid = ipvs->mcfg.syncid; mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); - sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)mesg + len; sb->firstuse = jiffies; return sb; } @@ -524,16 +533,15 @@ set: * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. */ -static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, +static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; struct ip_vs_sync_buff *buff; struct ipvs_master_sync_state *ms; int id; - int len; + unsigned int len; if (unlikely(cp->af != AF_INET)) return; @@ -553,17 +561,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, id = select_master_thread_id(ipvs, cp); ms = &ipvs->ms[id]; buff = ms->sync_buff; + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; if (buff) { m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; /* Send buffer if it is for v1 */ - if (!m->nr_conns) { + if (buff->head + len > buff->end || !m->nr_conns) { sb_queue_tail(ipvs, ms); ms->sync_buff = NULL; buff = NULL; } } if (!buff) { - buff = ip_vs_sync_buff_create_v0(ipvs); + buff = ip_vs_sync_buff_create_v0(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -572,8 +582,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, ms->sync_buff = buff; } - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; s = (struct ip_vs_sync_conn_v0 *) buff->head; @@ -597,12 +605,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, m->nr_conns++; m->size = htons(ntohs(m->size) + len); buff->head += len; - - /* check if there is a space for next one */ - if (buff->head + FULL_CONN_SIZE > buff->end) { - sb_queue_tail(ipvs, ms); - ms->sync_buff = NULL; - } spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ @@ -612,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, pkts = atomic_add_return(1, &cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); - ip_vs_sync_conn(net, cp, pkts); + ip_vs_sync_conn(ipvs, cp, pkts); } } @@ -621,9 +623,8 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, * Called by ip_vs_in. * Sending Version 1 messages */ -void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) +void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; struct ip_vs_sync_buff *buff; @@ -634,7 +635,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) /* Handle old version of the protocol */ if (sysctl_sync_ver(ipvs) == 0) { - ip_vs_sync_conn_v0(net, cp, pkts); + ip_vs_sync_conn_v0(ipvs, cp, pkts); return; } /* Do not sync ONE PACKET */ @@ -694,7 +695,7 @@ sloop: } if (!buff) { - buff = ip_vs_sync_buff_create(ipvs); + buff = ip_vs_sync_buff_create(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -781,21 +782,21 @@ control: * fill_param used by version 1 */ static inline int -ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, +ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, struct ip_vs_conn_param *p, __u8 *pe_data, unsigned int pe_data_len, __u8 *pe_name, unsigned int pe_name_len) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) - ip_vs_conn_fill_param(net, af, sc->v6.protocol, + ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, (const union nf_inet_addr *)&sc->v6.caddr, sc->v6.cport, (const union nf_inet_addr *)&sc->v6.vaddr, sc->v6.vport, p); else #endif - ip_vs_conn_fill_param(net, af, sc->v4.protocol, + ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, (const union nf_inet_addr *)&sc->v4.caddr, sc->v4.cport, (const union nf_inet_addr *)&sc->v4.vaddr, @@ -834,7 +835,7 @@ ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, * Param: ... * timeout is in sec. */ -static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, +static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, unsigned int flags, unsigned int state, unsigned int protocol, unsigned int type, const union nf_inet_addr *daddr, __be16 dport, @@ -843,7 +844,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, { struct ip_vs_dest *dest; struct ip_vs_conn *cp; - struct netns_ipvs *ipvs = net_ipvs(net); if (!(flags & IP_VS_CONN_F_TEMPLATE)) { cp = ip_vs_conn_in_get(param); @@ -901,7 +901,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * with synchronization, so we can make the assumption that * the svc_af is the same as the dest_af */ - dest = ip_vs_find_dest(net, type, type, daddr, dport, + dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); @@ -938,7 +938,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, } else { struct ip_vs_proto_data *pd; - pd = ip_vs_proto_data_get(net, protocol); + pd = ip_vs_proto_data_get(ipvs, protocol); if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) cp->timeout = pd->timeout_table[state]; else @@ -950,7 +950,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, /* * Process received multicast message for Version 0 */ -static void ip_vs_process_message_v0(struct net *net, const char *buffer, +static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, const size_t buflen) { struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; @@ -1006,14 +1006,14 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer, } } - ip_vs_conn_fill_param(net, AF_INET, s->protocol, + ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, (const union nf_inet_addr *)&s->caddr, s->cport, (const union nf_inet_addr *)&s->vaddr, s->vport, ¶m); /* Send timeout as Zero */ - ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, (union nf_inet_addr *)&s->daddr, s->dport, 0, 0, opt); } @@ -1064,7 +1064,7 @@ static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, /* * Process a Version 1 sync. connection */ -static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) { struct ip_vs_sync_conn_options opt; union ip_vs_sync_conn *s; @@ -1168,21 +1168,21 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) state = 0; } } - if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, pe_data_len, pe_name, pe_name_len)) { retc = 50; goto out; } /* If only IPv4, just silent skip IPv6 */ if (af == AF_INET) - ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, ntohl(s->v4.timeout), ntohl(s->v4.fwmark), (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) ); #ifdef CONFIG_IP_VS_IPV6 else - ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, ntohl(s->v6.timeout), ntohl(s->v6.fwmark), (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) @@ -1201,10 +1201,9 @@ out: * ip_vs_conn entries. * Handles Version 0 & 1 */ -static void ip_vs_process_message(struct net *net, __u8 *buffer, +static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, const size_t buflen) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; __u8 *p, *msg_end; int i, nr_conns; @@ -1219,7 +1218,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* SyncID sanity check */ - if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); return; } @@ -1254,7 +1253,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* Process a single sync_conn */ - retc = ip_vs_proc_sync_conn(net, p, msg_end); + retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); if (retc < 0) { IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", retc); @@ -1265,7 +1264,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, } } else { /* Old type of message */ - ip_vs_process_message_v0(net, buffer, buflen); + ip_vs_process_message_v0(ipvs, buffer, buflen); return; } } @@ -1303,6 +1302,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop) /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ lock_sock(sk); inet->mc_loop = loop ? 1 : 0; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_LOOP */ + np->mc_loop = loop ? 1 : 0; + } +#endif release_sock(sk); } @@ -1316,6 +1323,33 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); inet->mc_ttl = ttl; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_HOPS */ + np->mcast_hops = ttl; + } +#endif + release_sock(sk); +} + +/* Control fragmentation of messages */ +static void set_mcast_pmtudisc(struct sock *sk, int val) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ + lock_sock(sk); + inet->pmtudisc = val; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MTU_DISCOVER */ + np->pmtudisc = val; + } +#endif release_sock(sk); } @@ -1338,44 +1372,15 @@ static int set_mcast_if(struct sock *sk, char *ifname) lock_sock(sk); inet->mc_index = dev->ifindex; /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(struct net *net, int sync_state) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); - if (!dev) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", ipvs->send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); - if (!dev) - return -ENODEV; - - ipvs->recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", ipvs->recv_mesg_maxlen); + /* IPV6_MULTICAST_IF */ + np->mcast_oif = dev->ifindex; } +#endif + release_sock(sk); return 0; } @@ -1405,15 +1410,34 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) mreq.imr_ifindex = dev->ifindex; - rtnl_lock(); lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); - rtnl_unlock(); return ret; } +#ifdef CONFIG_IP_VS_IPV6 +static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, + char *ifname) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + int ret; + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); + release_sock(sk); + + return ret; +} +#endif static int bind_mcastif_addr(struct socket *sock, char *ifname) { @@ -1442,47 +1466,69 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); } +static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, + struct ipvs_sync_daemon_cfg *c, int id) +{ + if (AF_INET6 == c->mcast_af) { + sa->in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_port = htons(c->mcast_port + id), + }; + sa->in6.sin6_addr = c->mcast_group.in6; + *salen = sizeof(sa->in6); + } else { + sa->in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(c->mcast_port + id), + }; + sa->in.sin_addr = c->mcast_group.in; + *salen = sizeof(sa->in); + } +} + /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct net *net, int id) +static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) { - struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; } set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); + set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); + /* Allow fragmentation if MTU changes */ + set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 1, result); - result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + if (AF_INET == ipvs->mcfg.mcast_af) + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + else + result = 0; if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; } + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1499,20 +1545,16 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct net *net, int id) +static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) { - struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); @@ -1523,17 +1565,22 @@ static struct socket *make_receive_sock(struct net *net, int id) if (result > 0) set_sock_size(sock->sk, 0, result); - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ipvs->backup_mcast_ifn); +#ifdef CONFIG_IP_VS_IPV6 + if (ipvs->bcfg.mcast_af == AF_INET6) + result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, + ipvs->bcfg.mcast_ifn); + else +#endif + result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, + ipvs->bcfg.mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1634,14 +1681,14 @@ next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; - struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct netns_ipvs *ipvs = tinfo->ipvs; struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); + ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); for (;;) { sb = next_sync_buff(ipvs, ms); @@ -1690,12 +1737,12 @@ done: static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; - struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct netns_ipvs *ipvs = tinfo->ipvs; int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); + ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1705,14 +1752,14 @@ static int sync_thread_backup(void *data) /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - ipvs->recv_mesg_maxlen); + ipvs->bcfg.sync_maxlen); if (len <= 0) { if (len != -EAGAIN) pr_err("receiving message error\n"); break; } - ip_vs_process_message(tinfo->net, tinfo->buf, len); + ip_vs_process_message(ipvs, tinfo->buf, len); } } @@ -1725,16 +1772,18 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, + int state) { struct ip_vs_sync_thread_data *tinfo; struct task_struct **array = NULL, *task; struct socket *sock; - struct netns_ipvs *ipvs = net_ipvs(net); + struct net_device *dev; char *name; int (*threadfn)(void *data); - int id, count; + int id, count, hlen; int result = -ENOMEM; + u16 mtu, min_mtu; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", @@ -1746,22 +1795,46 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) } else count = ipvs->threads_mask + 1; + if (c->mcast_af == AF_UNSPEC) { + c->mcast_af = AF_INET; + c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); + } + if (!c->mcast_port) + c->mcast_port = IP_VS_SYNC_PORT; + if (!c->mcast_ttl) + c->mcast_ttl = 1; + + dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); + if (!dev) { + pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); + return -ENODEV; + } + hlen = (AF_INET6 == c->mcast_af) ? + sizeof(struct ipv6hdr) + sizeof(struct udphdr) : + sizeof(struct iphdr) + sizeof(struct udphdr); + mtu = (state == IP_VS_STATE_BACKUP) ? + clamp(dev->mtu, 1500U, 65535U) : 1500U; + min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; + + if (c->sync_maxlen) + c->sync_maxlen = clamp_t(unsigned int, + c->sync_maxlen, min_mtu, + 65535 - hlen); + else + c->sync_maxlen = mtu - hlen; + if (state == IP_VS_STATE_MASTER) { if (ipvs->ms) return -EEXIST; - strlcpy(ipvs->master_mcast_ifn, mcast_ifn, - sizeof(ipvs->master_mcast_ifn)); - ipvs->master_syncid = syncid; + ipvs->mcfg = *c; name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { if (ipvs->backup_threads) return -EEXIST; - strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, - sizeof(ipvs->backup_mcast_ifn)); - ipvs->backup_syncid = syncid; + ipvs->bcfg = *c; name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; } else { @@ -1789,14 +1862,13 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) if (!array) goto out; } - set_sync_mesg_maxlen(net, state); tinfo = NULL; for (id = 0; id < count; id++) { if (state == IP_VS_STATE_MASTER) - sock = make_send_sock(net, id); + sock = make_send_sock(ipvs, id); else - sock = make_receive_sock(net, id); + sock = make_receive_sock(ipvs, id); if (IS_ERR(sock)) { result = PTR_ERR(sock); goto outtinfo; @@ -1804,10 +1876,10 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) goto outsocket; - tinfo->net = net; + tinfo->ipvs = ipvs; tinfo->sock = sock; if (state == IP_VS_STATE_BACKUP) { - tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) goto outtinfo; @@ -1868,9 +1940,8 @@ out: } -int stop_sync_thread(struct net *net, int state) +int stop_sync_thread(struct netns_ipvs *ipvs, int state) { - struct netns_ipvs *ipvs = net_ipvs(net); struct task_struct **array; int id; int retc = -EINVAL; @@ -1936,27 +2007,24 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -int __net_init ip_vs_sync_net_init(struct net *net) +int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); return 0; } -void ip_vs_sync_net_cleanup(struct net *net) +void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) { int retc; - struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); - retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); if (retc && retc != -ESRCH) pr_err("Failed to stop Master Daemon\n"); - retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); if (retc && retc != -ESRCH) pr_err("Failed to stop Backup Daemon\n"); mutex_unlock(&ipvs->sync_mutex); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 258a0b0e82a2..3264cb49b333 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -212,19 +212,20 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } -static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, +static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, + int rt_mode, struct ip_vs_iphdr *ipvsh, struct sk_buff *skb, int mtu) { #ifdef CONFIG_IP_VS_IPV6 if (skb_af == AF_INET6) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { if (!skb->dev) skb->dev = net->loopback_dev; /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) + if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh)) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); @@ -233,8 +234,6 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, } else #endif { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); - /* If we're going to tunnel the packet and pmtu discovery * is disabled, we'll just fragment it anyway */ @@ -242,7 +241,8 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, return true; if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len > mtu && !skb_is_gso(skb))) { + skb->len > mtu && !skb_is_gso(skb) && + !ip_vs_iph_icmp(ipvsh))) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG(1, "frag needed for %pI4\n", @@ -256,11 +256,12 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, /* Get route to destination or remote server */ static int -__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, __be32 daddr, int rt_mode, __be32 *ret_saddr, struct ip_vs_iphdr *ipvsh) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ int mtu; @@ -336,7 +337,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -402,11 +403,12 @@ out_err: * Get route to destination or remote server */ static int -__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ struct dst_entry *dst; @@ -484,7 +486,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -573,8 +575,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, skb_forward_csum(skb); if (!skb->sk) skb_sender_cpu_clear(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; @@ -595,8 +597,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, skb_forward_csum(skb); if (!skb->sk) skb_sender_cpu_clear(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; return ret; @@ -629,7 +631,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, + if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; @@ -656,10 +658,13 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct ipv6hdr *iph = ipv6_hdr(skb); + EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL, + if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, + &iph->daddr, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; @@ -706,7 +711,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } was_input = rt_is_input_route(skb_rtable(skb)); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR, NULL, ipvsh); @@ -723,7 +728,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); goto tx_error; @@ -733,8 +738,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { - IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " - "stopping DNAT to loopback address"); + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off, + "ip_vs_nat_xmit(): stopping DNAT to loopback " + "address"); goto tx_error; } @@ -751,7 +757,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -794,7 +800,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -812,7 +819,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); goto tx_error; @@ -823,7 +830,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { - IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); goto tx_error; @@ -841,7 +848,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; - IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -967,8 +974,8 @@ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct net *net = skb_net(skb); - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ @@ -984,7 +991,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | @@ -1042,7 +1049,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip_local_out(skb); + ip_local_out(net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); @@ -1078,7 +1085,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1133,7 +1141,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(skb); + ip6_local_out(cp->ipvs->net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); @@ -1165,7 +1173,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); @@ -1204,7 +1212,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1273,7 +1282,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, NULL, iph); if (local < 0) goto tx_error; @@ -1365,8 +1374,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, - NULL, ipvsh, 0, rt_mode); + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; rt = (struct rt6_info *) skb_dst(skb); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ac3be9b0629b..09d1d19b2ab9 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -168,6 +168,7 @@ nf_ct_get_tuple(const struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, + struct net *net, struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) @@ -181,12 +182,13 @@ nf_ct_get_tuple(const struct sk_buff *skb, tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; - return l4proto->pkt_to_tuple(skb, dataoff, tuple); + return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); } EXPORT_SYMBOL_GPL(nf_ct_get_tuple); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, - u_int16_t l3num, struct nf_conntrack_tuple *tuple) + u_int16_t l3num, + struct net *net, struct nf_conntrack_tuple *tuple) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -205,7 +207,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, l4proto = __nf_ct_l4proto_find(l3num, protonum); - ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple, + ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, l3proto, l4proto); rcu_read_unlock(); @@ -313,12 +315,13 @@ out_free: } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); -static void nf_ct_tmpl_free(struct nf_conn *tmpl) +void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); nf_ct_ext_free(tmpl); kfree(tmpl); } +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_conntrack(struct nf_conntrack *nfct) @@ -1028,7 +1031,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, &tuple, l3proto, + dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("resolve_normal_ct: Can't get tuple\n"); return NULL; @@ -1286,13 +1289,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); -/* Built-in default zone used e.g. by modules. */ -const struct nf_conntrack_zone nf_ct_zone_dflt = { - .id = NF_CT_DEFAULT_ZONE_ID, - .dir = NF_CT_DEFAULT_ZONE_DIR, -}; -EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); - #ifdef CONFIG_NF_CONNTRACK_ZONES static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { .len = sizeof(struct nf_conntrack_zone), diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index bb53f120e79c..3ce5c314ea4b 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -14,6 +14,8 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> +static spinlock_t nf_connlabels_lock; + static unsigned int label_bits(const struct nf_conn_labels *l) { unsigned int longs = l->words; @@ -48,7 +50,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) } EXPORT_SYMBOL_GPL(nf_connlabel_set); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static void replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -89,7 +90,35 @@ int nf_connlabels_replace(struct nf_conn *ct, return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); -#endif + +int nf_connlabels_get(struct net *net, unsigned int n_bits) +{ + size_t words; + + if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE)) + return -ERANGE; + + words = BITS_TO_LONGS(n_bits); + + spin_lock(&nf_connlabels_lock); + net->ct.labels_used++; + if (words > net->ct.label_words) + net->ct.label_words = words; + spin_unlock(&nf_connlabels_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_get); + +void nf_connlabels_put(struct net *net) +{ + spin_lock(&nf_connlabels_lock); + net->ct.labels_used--; + if (net->ct.labels_used == 0) + net->ct.label_words = 0; + spin_unlock(&nf_connlabels_lock); +} +EXPORT_SYMBOL_GPL(nf_connlabels_put); static struct nf_ct_ext_type labels_extend __read_mostly = { .len = sizeof(struct nf_conn_labels), @@ -99,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = { int nf_conntrack_labels_init(void) { + spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 6dd995c7c72b..fce1b1cca32d 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -398,7 +398,7 @@ static inline struct dccp_net *dccp_pernet(struct net *net) } static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { struct dccp_hdr _hdr, *dh; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 2281be419a74..86dc752e5349 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -45,7 +45,7 @@ static inline struct nf_generic_net *generic_pernet(struct net *net) static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { tuple->src.u.all = 0; tuple->dst.u.all = 0; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 7648674f29c3..a96451a7af20 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -190,9 +190,8 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, /* gre hdr info to tuple */ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { - struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev); const struct gre_hdr_pptp *pgrehdr; struct gre_hdr_pptp _pgrehdr; __be16 srckey; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 67197731eb68..9578a7c371ef 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -156,7 +156,7 @@ static inline struct sctp_net *sctp_pernet(struct net *net) } static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct sctphdr *hp; struct sctphdr _hdr; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 70383de72054..278f3b9356ef 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -277,7 +277,7 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net) } static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct tcphdr *hp; struct tcphdr _hdr; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 6957281ffee5..478f92f834b6 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -38,6 +38,7 @@ static inline struct nf_udp_net *udp_pernet(struct net *net) static bool udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct udphdr *hp; diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index c5903d1649f9..1ac8ee13a873 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -48,6 +48,7 @@ static inline struct udplite_net *udplite_pernet(struct net *net) static bool udplite_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct udphdr *hp; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 675d12c69e32..a5d41dfa9f05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -107,12 +107,17 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { + const struct nf_logger *log; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < NFPROTO_NUMPROTO; i++) - RCU_INIT_POINTER(loggers[i][logger->type], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = nft_log_dereference(loggers[i][logger->type]); + if (log == logger) + RCU_INIT_POINTER(loggers[i][logger->type], NULL); + } mutex_unlock(&nf_log_mutex); + synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 5113dfd39df9..06a9f45771ab 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -83,7 +83,7 @@ out: rcu_read_unlock(); } -int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) +int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct flowi fl; unsigned int hh_len; @@ -99,7 +99,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + dst = xfrm_lookup(net, dst, &fl, skb->sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 96777f9a9350..34f628e16a4c 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -199,7 +199,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT) { afinfo = nf_get_afinfo(entry->state.pf); - if (!afinfo || afinfo->reroute(skb, entry) < 0) + if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0) verdict = NF_DROP; } @@ -215,7 +215,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) case NF_ACCEPT: case NF_STOP: local_bh_disable(); - entry->state.okfn(entry->state.sk, skb); + entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8fbbdb09826e..c8a4a48bced9 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -188,7 +188,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; - u32 *ptr, old; + __be32 *ptr, old; if (synproxy->tsoff == 0) return 1; @@ -216,12 +216,12 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, if (op[0] == TCPOPT_TIMESTAMP && op[1] == TCPOLEN_TIMESTAMP) { if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - ptr = (u32 *)&op[2]; + ptr = (__be32 *)&op[2]; old = *ptr; *ptr = htonl(ntohl(*ptr) - synproxy->tsoff); } else { - ptr = (u32 *)&op[6]; + ptr = (__be32 *)&op[6]; old = *ptr; *ptr = htonl(ntohl(*ptr) + synproxy->tsoff); @@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net) err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 05d0b03530f6..f3695a497408 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -48,9 +48,7 @@ static void __nft_trace_packet(const struct nft_pktinfo *pkt, const struct nft_chain *chain, int rulenum, enum nft_trace type) { - struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - - nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", chain->table->name, chain->name, comments[type], rulenum); @@ -111,10 +109,10 @@ struct nft_jumpstack { }; unsigned int -nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) +nft_do_chain(struct nft_pktinfo *pkt, void *priv) { - const struct nft_chain *chain = ops->priv, *basechain = chain; - const struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); + const struct nft_chain *chain = priv, *basechain = chain; + const struct net *net = pkt->net; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_regs regs; diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 2cae4d4a03b7..7b9c053ba750 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -17,13 +17,13 @@ static inline void nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, struct sk_buff *skb, + struct sk_buff *skb, const struct nf_hook_state *state) { struct iphdr *iph, _iph; u32 len, thoff; - nft_set_pktinfo(pkt, ops, skb, state); + nft_set_pktinfo(pkt, skb, state); iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), &_iph); @@ -48,7 +48,6 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, static inline void __nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -82,33 +81,32 @@ __nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, } static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { - nft_set_pktinfo(pkt, ops, skb, state); - __nft_netdev_set_pktinfo_ipv6(pkt, ops, skb, state); + nft_set_pktinfo(pkt, skb, state); + __nft_netdev_set_pktinfo_ipv6(pkt, skb, state); } static unsigned int -nft_do_chain_netdev(const struct nf_hook_ops *ops, struct sk_buff *skb, +nft_do_chain_netdev(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_netdev_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_netdev_set_pktinfo_ipv6(&pkt, ops, skb, state); + nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, ops, skb, state); + nft_set_pktinfo(&pkt, skb, state); break; } - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } static struct nft_af_info nft_af_netdev __read_mostly = { diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 0c0e8ecf02ab..f1d9e887f5b1 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -64,7 +64,7 @@ void nfnl_unlock(__u8 subsys_id) EXPORT_SYMBOL_GPL(nfnl_unlock); #ifdef CONFIG_PROVE_LOCKING -int lockdep_nfnl_is_held(u8 subsys_id) +bool lockdep_nfnl_is_held(u8 subsys_id) { return lockdep_is_held(&table[subsys_id].mutex); } @@ -444,6 +444,7 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); + u_int16_t res_id; int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || @@ -468,7 +469,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); - nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + nfnetlink_rcv_batch(skb, nlh, res_id); } else { netlink_rcv_skb(skb, &nfnetlink_rcv_msg); } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 4670821b569d..cc2300f4e177 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -538,9 +538,9 @@ __build_packet_message(struct nfnl_log_net *log, if (skb->tstamp.tv64) { struct nfulnl_msg_packet_timestamp ts; - struct timeval tv = ktime_to_timeval(skb->tstamp); - ts.sec = cpu_to_be64(tv.tv_sec); - ts.usec = cpu_to_be64(tv.tv_usec); + struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + ts.sec = cpu_to_be64(kts.tv_sec); + ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 685cc6a17163..41583e30051b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0; + size_t data_len = 0, cap_len = 0, rem_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -360,6 +360,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; + rem_len = data_len - hlen; break; } @@ -377,7 +378,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, + skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); @@ -669,8 +670,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) struct nfqnl_instance *queue; struct sk_buff *skb, *segs; int err = -ENOBUFS; - struct net *net = dev_net(entry->state.in ? - entry->state.in : entry->state.out); + struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); /* rcu_read_lock()ed by nf_hook_slow() */ diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 66def315eb56..9c8fab00164b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -619,6 +619,13 @@ struct nft_xt { static struct nft_expr_type nft_match_type; +static bool nft_match_cmp(const struct xt_match *match, + const char *name, u32 rev, u32 family) +{ + return strcmp(match->name, name) == 0 && match->revision == rev && + (match->family == NFPROTO_UNSPEC || match->family == family); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -626,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_match; struct xt_match *match; char *mt_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -641,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_match, &nft_match_list, head) { struct xt_match *match = nft_match->ops.data; - if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) { + if (nft_match_cmp(match, mt_name, rev, family)) { if (!try_module_get(match->me)) return ERR_PTR(-ENOENT); @@ -693,6 +699,13 @@ static LIST_HEAD(nft_target_list); static struct nft_expr_type nft_target_type; +static bool nft_target_cmp(const struct xt_target *tg, + const char *name, u32 rev, u32 family) +{ + return strcmp(tg->name, name) == 0 && tg->revision == rev && + (tg->family == NFPROTO_UNSPEC || tg->family == family); +} + static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -700,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_target; struct xt_target *target; char *tg_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -715,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; - if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) { + if (nft_target_cmp(target, tg_name, rev, family)) { if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index a13d6a386d63..319c22b4bca2 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -31,9 +31,8 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_log *priv = nft_expr_priv(expr); - struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &priv->loginfo, "%s", priv->prefix); } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index cb2f13ebb5a6..e4ad2c24bc41 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -42,7 +42,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, *(__be16 *)dest = skb->protocol; break; case NFT_META_NFPROTO: - *dest = pkt->ops->pf; + *dest = pkt->pf; break; case NFT_META_L4PROTO: *dest = pkt->tprot; @@ -135,7 +135,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; } - switch (pkt->ops->pf) { + switch (pkt->pf) { case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) *dest = PACKET_MULTICAST; diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 96805d21d618..61d216eb7917 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -42,7 +42,7 @@ static void nft_queue_eval(const struct nft_expr *expr, queue = priv->queuenum + cpu % priv->queues_total; } else { queue = nfqueue_hash(pkt->skb, queue, - priv->queues_total, pkt->ops->pf, + priv->queues_total, pkt->pf, jhash_initval); } } diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 635dbba93d01..759ca5248a3d 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -22,38 +22,37 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); - switch (pkt->ops->pf) { + switch (pkt->pf) { case NFPROTO_IPV4: switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: nf_send_unreach(pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->skb, pkt->ops->hooknum); + nf_send_reset(pkt->net, pkt->skb, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, nft_reject_icmp_code(priv->icmp_code), - pkt->ops->hooknum); + pkt->hook); break; } break; case NFPROTO_IPV6: switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code, + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + nf_send_reset6(pkt->net, pkt->skb, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: - nf_send_unreach6(net, pkt->skb, + nf_send_unreach6(pkt->net, pkt->skb, nft_reject_icmpv6_code(priv->icmp_code), - pkt->ops->hooknum); + pkt->hook); break; } break; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 8e524898ccea..faf32d888198 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -255,7 +255,7 @@ out: return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index c13b79440ede..1763ab82bcd7 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -33,7 +33,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_log_info *loginfo = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index fb7497c928a0..a1fa2c800cb9 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -26,7 +26,7 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 8c02501a530f..b7c43def0dc6 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -108,7 +108,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); if (dst_mtu(skb_dst(skb)) <= minlen) { diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 49fee6aa2c0a..899b06115fc5 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -32,18 +32,18 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - nf_dup_ipv4(skb, par->hooknum, &info->gw.in, info->priv->oif); + nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, info->priv->oif); return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - nf_dup_ipv6(skb, par->hooknum, &info->gw.in6, info->priv->oif); + nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, info->priv->oif); return XT_CONTINUE; } @@ -129,7 +129,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) { .name = "TEE", .revision = 1, diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index d0c96c5ae29a..3ab591e73ec0 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -250,8 +250,8 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, * no such listener is found, or NULL if the TCP header is incomplete. */ static struct sock * -tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, - struct sock *sk) +tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, + __be32 laddr, __be16 lport, struct sock *sk) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr _hdr, *hp; @@ -267,7 +267,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); @@ -290,7 +290,7 @@ nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) } static unsigned int -tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, +tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) { const struct iphdr *iph = ip_hdr(skb); @@ -305,7 +305,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -317,11 +317,11 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) /* reopening a TIME_WAIT connection needs special handling */ - sk = tproxy_handle_time_wait4(skb, laddr, lport, sk); + sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk); else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -351,7 +351,7 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info *tgi = par->targinfo; - return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); + return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); } static unsigned int @@ -359,7 +359,7 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; - return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); + return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } #ifdef XT_TPROXY_HAVE_IPV6 @@ -429,7 +429,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, @@ -472,7 +472,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -487,7 +487,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, laddr, hp->source, lport, par->in, NFT_LOOKUP_LISTENER); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 5b4743cc0436..11d6091991a4 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev, static bool addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; const struct xt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; @@ -143,7 +143,7 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) static bool addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; const struct xt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph; const struct net_device *dev = NULL; diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index 9f8719df2001..bb9cbeb18868 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) XT_CONNLABEL_OP_SET; struct xt_connlabel_mtinfo *info = par->matchinfo; int ret; - size_t words; - - if (info->bit > XT_CONNLABEL_MAXBIT) - return -ERANGE; if (info->options & ~options) { pr_err("Unknown options in mask %x\n", info->options); @@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) return ret; } - par->net->ct.labels_used++; - words = BITS_TO_LONGS(info->bit+1); - if (words > par->net->ct.label_words) - par->net->ct.label_words = words; - + ret = nf_connlabels_get(par->net, info->bit + 1); + if (ret < 0) + nf_ct_l3proto_module_put(par->family); return ret; } static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) { - par->net->ct.labels_used--; - if (par->net->ct.labels_used == 0) - par->net->ct.label_words = 0; + nf_connlabels_put(par->net); nf_ct_l3proto_module_put(par->family); } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 075d89d94d28..99bbc829868d 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -317,7 +317,7 @@ static int count_them(struct net *net, static bool connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; const struct xt_connlimit_info *info = par->matchinfo; union nf_inet_addr addr; struct nf_conntrack_tuple tuple; @@ -332,7 +332,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; zone = nf_ct_zone(ct); } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - par->family, &tuple)) { + par->family, net, &tuple)) { goto hotdrop; } diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index 8d47c3780fda..71a9d95e0a81 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -48,6 +48,7 @@ static bool ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_ipvs_mtinfo *data = par->matchinfo; + struct netns_ipvs *ipvs = net_ipvs(par->net); /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */ const u_int8_t family = par->family; struct ip_vs_iphdr iph; @@ -67,7 +68,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) goto out; } - ip_vs_fill_iph_skb(family, skb, &iph); + ip_vs_fill_iph_skb(family, skb, true, &iph); if (data->bitmask & XT_IPVS_PROTO) if ((iph.protocol == data->l4proto) ^ @@ -85,7 +86,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */); + cp = pp->conn_out_get(ipvs, family, skb, &iph); if (unlikely(cp == NULL)) { match = false; goto out; diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 0778855ea5e7..df8801e02a32 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -200,7 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) unsigned char opts[MAX_IPOPTLEN]; const struct xt_osf_finger *kf; const struct xt_osf_user_finger *f; - struct net *net = dev_net(p->in ? p->in : p->out); + struct net *net = p->net; if (!info) return false; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 45e1b30e4fb2..d725a27743a1 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -237,7 +237,7 @@ static void recent_table_flush(struct recent_table *t) static bool recent_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; struct recent_net *recent_net = recent_pernet(net); const struct xt_recent_mtinfo_v1 *info = par->matchinfo; struct recent_table *t; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 43e26c881100..2ec08f04b816 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -143,7 +143,8 @@ static bool xt_socket_sk_is_transparent(struct sock *sk) } } -static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb, +static struct sock *xt_socket_lookup_slow_v4(struct net *net, + const struct sk_buff *skb, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); @@ -197,7 +198,7 @@ static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb, } #endif - return xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr, + return xt_socket_get_sock_v4(net, protocol, saddr, daddr, sport, dport, indev); } @@ -209,7 +210,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sock *sk = skb->sk; if (!sk) - sk = xt_socket_lookup_slow_v4(skb, par->in); + sk = xt_socket_lookup_slow_v4(par->net, skb, par->in); if (sk) { bool wildcard; bool transparent = true; @@ -335,7 +336,8 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol, return NULL; } -static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, +static struct sock *xt_socket_lookup_slow_v6(struct net *net, + const struct sk_buff *skb, const struct net_device *indev) { __be16 uninitialized_var(dport), uninitialized_var(sport); @@ -371,7 +373,7 @@ static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, return NULL; } - return xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr, + return xt_socket_get_sock_v6(net, tproto, saddr, daddr, sport, dport, indev); } @@ -383,7 +385,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sock *sk = skb->sk; if (!sk) - sk = xt_socket_lookup_slow_v6(skb, par->in); + sk = xt_socket_lookup_slow_v6(par->net, skb, par->in); if (sk) { bool wildcard; bool transparent = true; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 67d210477863..8f060d7f9a0e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -84,6 +84,7 @@ struct listeners { #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 #define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 static inline int netlink_is_kernel(struct sock *sk) { @@ -124,6 +125,24 @@ static inline u32 netlink_group_mask(u32 group) return group ? 1 << (group - 1) : 0; } +static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, + gfp_t gfp_mask) +{ + unsigned int len = skb_end_offset(skb); + struct sk_buff *new; + + new = alloc_skb(len, gfp_mask); + if (new == NULL) + return NULL; + + NETLINK_CB(new).portid = NETLINK_CB(skb).portid; + NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; + NETLINK_CB(new).creds = NETLINK_CB(skb).creds; + + memcpy(skb_put(new, len), skb->data, len); + return new; +} + int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -205,7 +224,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, int ret = -ENOMEM; dev_hold(dev); - nskb = skb_clone(skb, GFP_ATOMIC); + + if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + nskb = netlink_to_full_skb(skb, GFP_ATOMIC); + else + nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); @@ -278,11 +301,6 @@ static void netlink_rcv_wake(struct sock *sk) } #ifdef CONFIG_NETLINK_MMAP -static bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -} - static bool netlink_rx_is_mmaped(struct sock *sk) { return nlk_sk(sk)->rx_ring.pg_vec != NULL; @@ -593,16 +611,6 @@ netlink_current_frame(const struct netlink_ring *ring, return netlink_lookup_frame(ring, ring->head, status); } -static struct nl_mmap_hdr * -netlink_previous_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - unsigned int prev; - - prev = ring->head ? ring->head - 1 : ring->frame_max; - return netlink_lookup_frame(ring, prev, status); -} - static void netlink_increment_head(struct netlink_ring *ring) { ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; @@ -610,11 +618,11 @@ static void netlink_increment_head(struct netlink_ring *ring) static void netlink_forward_ring(struct netlink_ring *ring) { - unsigned int head = ring->head, pos = head; + unsigned int head = ring->head; const struct nl_mmap_hdr *hdr; do { - hdr = __netlink_lookup_frame(ring, pos); + hdr = __netlink_lookup_frame(ring, ring->head); if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) break; if (hdr->nm_status != NL_MMAP_STATUS_SKIP) @@ -623,6 +631,21 @@ static void netlink_forward_ring(struct netlink_ring *ring) } while (ring->head != head); } +static bool netlink_has_valid_frame(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_VALID) + return true; + pos = pos != 0 ? pos - 1 : ring->frame_max; + } while (pos != head); + + return false; +} + static bool netlink_dump_space(struct netlink_sock *nlk) { struct netlink_ring *ring = &nlk->rx_ring; @@ -668,13 +691,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, mask = datagram_poll(file, sock, wait); - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - netlink_forward_ring(&nlk->rx_ring); - if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLIN | POLLRDNORM; + /* We could already have received frames in the normal receive + * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, + * so if mask contains pollin/etc already, there's no point + * walking the ring. + */ + if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + if (netlink_has_valid_frame(&nlk->rx_ring)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); } - spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (nlk->tx_ring.pg_vec) { @@ -834,7 +863,6 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NETLINK_MMAP */ -#define netlink_skb_is_mmaped(skb) false #define netlink_rx_is_mmaped(sk) false #define netlink_tx_is_mmaped(sk) false #define netlink_mmap sock_no_mmap @@ -1082,8 +1110,8 @@ static int netlink_insert(struct sock *sk, u32 portid) lock_sock(sk); - err = -EBUSY; - if (nlk_sk(sk)->portid) + err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; + if (nlk_sk(sk)->bound) goto err; err = -ENOMEM; @@ -1103,10 +1131,14 @@ static int netlink_insert(struct sock *sk, u32 portid) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; - nlk_sk(sk)->portid = 0; sock_put(sk); + goto err; } + /* We need to ensure that the socket is hashed and visible. */ + smp_wmb(); + nlk_sk(sk)->bound = portid; + err: release_sock(sk); return err; @@ -1491,6 +1523,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; long unsigned int groups = nladdr->nl_groups; + bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1507,9 +1540,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return err; } - if (nlk->portid) + bound = nlk->bound; + if (bound) { + /* Ensure nlk->portid is up-to-date. */ + smp_rmb(); + if (nladdr->nl_pid != nlk->portid) return -EINVAL; + } if (nlk->netlink_bind && groups) { int group; @@ -1525,7 +1563,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, } } - if (!nlk->portid) { + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); @@ -1573,7 +1614,10 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; - if (!nlk->portid) + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!nlk->bound) err = netlink_autobind(sock); if (err == 0) { @@ -1832,15 +1876,16 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) +struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask) { #ifdef CONFIG_NETLINK_MMAP + unsigned int maxlen, linear_size; struct sock *sk = NULL; struct sk_buff *skb; struct netlink_ring *ring; struct nl_mmap_hdr *hdr; - unsigned int maxlen; sk = netlink_getsockbyportid(ssk, dst_portid); if (IS_ERR(sk)) @@ -1851,7 +1896,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_put; - if (ring->frame_size - NL_MMAP_HDRLEN < size) + /* We need to account the full linear size needed as a ring + * slot cannot have non-linear parts. + */ + linear_size = size + ldiff; + if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) goto out_put; skb = alloc_skb_head(gfp_mask); @@ -1865,13 +1914,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, /* check again under lock */ maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < size) + if (maxlen < linear_size) goto out_free; netlink_forward_ring(ring); hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); if (hdr == NULL) goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); atomic_inc(&ring->pending); @@ -1897,7 +1947,7 @@ out: #endif return alloc_skb(size, gfp_mask); } -EXPORT_SYMBOL_GPL(netlink_alloc_skb); +EXPORT_SYMBOL_GPL(__netlink_alloc_skb); int netlink_has_listeners(struct sock *sk, unsigned int group) { @@ -2258,6 +2308,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; err = 0; break; + case NETLINK_CAP_ACK: + if (val) + nlk->flags |= NETLINK_F_CAP_ACK; + else + nlk->flags &= ~NETLINK_F_CAP_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2332,6 +2389,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, netlink_table_ungrab(); break; } + case NETLINK_CAP_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2391,17 +2458,20 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->portid) { + if (!nlk->bound) { err = netlink_autobind(sock); if (err) goto out; + } else { + /* Ensure nlk is hashed and visible. */ + smp_rmb(); } /* It's a really convoluted way for userland to ask for mmaped * sendmsg(), but that's what we've got... */ if (netlink_tx_is_mmaped(sk) && - msg->msg_iter.type == ITER_IOVEC && + iter_is_iovec(&msg->msg_iter) && msg->msg_iter.nr_segs == 1 && msg->msg_iter.iov->iov_base == NULL) { err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, @@ -2873,9 +2943,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); - /* error messages get the original request appened */ - if (err) + /* Error messages get the original request appened, unless the user + * requests to cap the error message. + */ + if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), @@ -2898,7 +2971,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = err; - memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); + memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 89008405d6b4..14437d9b1965 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -35,6 +35,7 @@ struct netlink_sock { unsigned long state; size_t max_recvmsg_len; wait_queue_head_t wait; + bool bound; bool cb_running; struct netlink_callback cb; struct mutex *cb_mutex; @@ -59,6 +60,15 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } +static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +#else + return false; +#endif /* CONFIG_NETLINK_MMAP */ +} + struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2ed5f964772e..bc0e504f33a6 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -39,7 +39,7 @@ void genl_unlock(void) EXPORT_SYMBOL(genl_unlock); #ifdef CONFIG_LOCKDEP -int lockdep_genl_is_held(void) +bool lockdep_genl_is_held(void) { return lockdep_is_held(&genl_mutex); } @@ -1136,19 +1136,19 @@ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb, } EXPORT_SYMBOL(genlmsg_multicast_allns); -void genl_notify(struct genl_family *family, - struct sk_buff *skb, struct net *net, u32 portid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) +void genl_notify(struct genl_family *family, struct sk_buff *skb, + struct genl_info *info, u32 group, gfp_t flags) { + struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; int report = 0; - if (nlh) - report = nlmsg_report(nlh); + if (info->nlhdr) + report = nlmsg_report(info->nlhdr); if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; group = family->mcgrp_offset + group; - nlmsg_notify(sk, skb, portid, group, report, flags); + nlmsg_notify(sk, skb, info->snd_portid, group, report, flags); } EXPORT_SYMBOL(genl_notify); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 422dc0567de9..d143aa9f6654 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,6 +5,8 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) select LIBCRC32C select MPLS select NET_MPLS_GSO @@ -59,7 +61,7 @@ config OPENVSWITCH_VXLAN config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH - depends on GENEVE_CORE + depends on GENEVE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create geneve vport. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 6e1701de04d8..60f809085b92 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif + obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 4f4200717bef..1d21ab9d2b5c 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -22,6 +22,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/openvswitch.h> +#include <linux/netfilter_ipv6.h> #include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> @@ -29,8 +30,10 @@ #include <linux/if_arp.h> #include <linux/if_vlan.h> +#include <net/dst.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/ip6_fib.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/mpls.h> @@ -38,6 +41,7 @@ #include "datapath.h" #include "flow.h" +#include "conntrack.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, @@ -52,6 +56,20 @@ struct deferred_action { struct sw_flow_key pkt_key; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + __u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[MAX_L2_LEN]; +}; + +static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); + #define DEFERRED_ACTION_FIFO_SIZE 10 struct action_fifo { int head; @@ -185,10 +203,6 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, return 0; } -/* 'KEY' must not have any bits set outside of the 'MASK' */ -#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) -#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK)) - static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { @@ -201,7 +215,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; stack = (__be32 *)skb_mpls_header(skb); - lse = MASKED(*stack, *mpls_lse, *mask); + lse = OVS_MASKED(*stack, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~(*stack), lse }; @@ -244,9 +258,9 @@ static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) const u16 *src = (const u16 *)src_; const u16 *mask = (const u16 *)mask_; - SET_MASKED(dst[0], src[0], mask[0]); - SET_MASKED(dst[1], src[1], mask[1]); - SET_MASKED(dst[2], src[2], mask[2]); + OVS_SET_MASKED(dst[0], src[0], mask[0]); + OVS_SET_MASKED(dst[1], src[1], mask[1]); + OVS_SET_MASKED(dst[2], src[2], mask[2]); } static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, @@ -338,10 +352,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], const __be32 mask[4], __be32 masked[4]) { - masked[0] = MASKED(old[0], addr[0], mask[0]); - masked[1] = MASKED(old[1], addr[1], mask[1]); - masked[2] = MASKED(old[2], addr[2], mask[2]); - masked[3] = MASKED(old[3], addr[3], mask[3]); + masked[0] = OVS_MASKED(old[0], addr[0], mask[0]); + masked[1] = OVS_MASKED(old[1], addr[1], mask[1]); + masked[2] = OVS_MASKED(old[2], addr[2], mask[2]); + masked[3] = OVS_MASKED(old[3], addr[3], mask[3]); } static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, @@ -358,15 +372,15 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) { /* Bits 21-24 are always unmasked, so this retains their values. */ - SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); + OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); + OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, u8 mask) { - new_ttl = MASKED(nh->ttl, new_ttl, mask); + new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask); csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); nh->ttl = new_ttl; @@ -392,7 +406,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, * makes sense to check if the value actually changed. */ if (mask->ipv4_src) { - new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); + new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); if (unlikely(new_addr != nh->saddr)) { set_ip_addr(skb, nh, &nh->saddr, new_addr); @@ -400,7 +414,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv4_dst) { - new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); + new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); if (unlikely(new_addr != nh->daddr)) { set_ip_addr(skb, nh, &nh->daddr, new_addr); @@ -488,7 +502,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit); + OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, + mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; @@ -517,8 +532,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, uh = udp_hdr(skb); /* Either of the masks is non-zero, so do not bother checking them. */ - src = MASKED(uh->source, key->udp_src, mask->udp_src); - dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst); + src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src); + dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst); if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { if (likely(src != uh->source)) { @@ -558,12 +573,12 @@ static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; th = tcp_hdr(skb); - src = MASKED(th->source, key->tcp_src, mask->tcp_src); + src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src); if (likely(src != th->source)) { set_tp_port(skb, &th->source, src, &th->check); flow_key->tp.src = src; } - dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst); if (likely(dst != th->dest)) { set_tp_port(skb, &th->dest, dst, &th->check); flow_key->tp.dst = dst; @@ -590,8 +605,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, old_csum = sh->checksum; old_correct_csum = sctp_compute_cksum(skb, sctphoff); - sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src); - sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); + sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); new_csum = sctp_compute_cksum(skb, sctphoff); @@ -605,14 +620,146 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct vport *vport = data->vport; + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *OVS_CB(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + ovs_vport_send(vport, skb); + return 0; +} + +static unsigned int +ovs_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops ovs_dst_ops = { + .family = AF_UNSPEC, + .mtu = ovs_dst_get_mtu, +}; + +/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is + * ovs_vport_output(), which is called once per fragmented packet. + */ +static void prepare_frag(struct vport *vport, struct sk_buff *skb) +{ + unsigned int hlen = skb_network_offset(skb); + struct ovs_frag_data *data; + + data = this_cpu_ptr(&ovs_frag_data_storage); + data->dst = skb->_skb_refdst; + data->vport = vport; + data->cb = *OVS_CB(skb); + data->inner_protocol = skb->inner_protocol; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static void ovs_fragment(struct net *net, struct vport *vport, + struct sk_buff *skb, u16 mru, __be16 ethertype) +{ + if (skb_network_offset(skb) > MAX_L2_LEN) { + OVS_NLERR(1, "L2 header too long to fragment"); + return; + } + + if (ethertype == htons(ETH_P_IP)) { + struct dst_entry ovs_dst; + unsigned long orig_dst; + + prepare_frag(vport, skb); + dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_dst); + IPCB(skb)->frag_max_size = mru; + + ip_do_fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else if (ethertype == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + unsigned long orig_dst; + struct rt6_info ovs_rt; + + if (!v6ops) { + kfree_skb(skb); + return; + } + + prepare_frag(vport, skb); + memset(&ovs_rt, 0, sizeof(ovs_rt)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + v6ops->fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else { + WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", + ovs_vport_name(vport), ntohs(ethertype), mru, + vport->dev->mtu); + kfree_skb(skb); + } +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + struct sw_flow_key *key) { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport)) - ovs_vport_send(vport, skb); - else + if (likely(vport)) { + u16 mru = OVS_CB(skb)->mru; + + if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { + ovs_vport_send(vport, skb); + } else if (mru <= vport->dev->mtu) { + struct net *net = read_pnet(&dp->net); + __be16 ethertype = key->eth.type; + + if (!is_flow_key_valid(key)) { + if (eth_p_mpls(skb->protocol)) + ethertype = skb->inner_protocol; + else + ethertype = vlan_get_protocol(skb); + } + + ovs_fragment(net, vport, skb, mru, ethertype); + } else { + kfree_skb(skb); + } + } else { kfree_skb(skb); + } } static int output_userspace(struct datapath *dp, struct sk_buff *skb, @@ -626,6 +773,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; + upcall.mru = OVS_CB(skb)->mru; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -646,11 +794,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, if (vport) { int err; + upcall.egress_tun_info = &info; err = ovs_vport_get_egress_tun_info(vport, skb, - &info); - if (!err) - upcall.egress_tun_info = &info; + &upcall); + if (err) + upcall.egress_tun_info = NULL; } + break; } @@ -749,10 +899,6 @@ static int execute_set_action(struct sk_buff *skb, skb_dst_drop(skb); dst_hold((struct dst_entry *)tun->tun_dst); skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); - - /* FIXME: Remove when all vports have been converted */ - OVS_CB(skb)->egress_tun_info = &tun->tun_dst->u.tun_info; - return 0; } @@ -770,12 +916,13 @@ static int execute_masked_set_action(struct sk_buff *skb, switch (nla_type(a)) { case OVS_KEY_ATTR_PRIORITY: - SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->priority, nla_get_u32(a), + *get_mask(a, u32 *)); flow_key->phy.priority = skb->priority; break; case OVS_KEY_ATTR_SKB_MARK: - SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); flow_key->phy.skb_mark = skb->mark; break; @@ -818,6 +965,13 @@ static int execute_masked_set_action(struct sk_buff *skb, err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, __be32 *)); break; + + case OVS_KEY_ATTR_CT_STATE: + case OVS_KEY_ATTR_CT_ZONE: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABEL: + err = -EINVAL; + break; } return err; @@ -887,7 +1041,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); if (out_skb) - do_output(dp, out_skb, prev_port); + do_output(dp, out_skb, prev_port, key); prev_port = -1; } @@ -944,6 +1098,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, key, a, attr, len); break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key, + nla_data(a)); + + /* Hide stolen IP fragments from user space. */ + if (err == -EINPROGRESS) + return 0; + break; } if (unlikely(err)) { @@ -953,7 +1116,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } if (prev_port != -1) - do_output(dp, skb, prev_port); + do_output(dp, skb, prev_port, key); else consume_skb(skb); @@ -995,7 +1158,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, int err; this_cpu_inc(exec_actions_level); - OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c new file mode 100644 index 000000000000..ad614267cc2a --- /dev/null +++ b/net/openvswitch/conntrack.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/openvswitch.h> +#include <net/ip.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> + +#include "datapath.h" +#include "conntrack.h" +#include "flow.h" +#include "flow_netlink.h" + +struct ovs_ct_len_tbl { + size_t maxlen; + size_t minlen; +}; + +/* Metadata mark for masked write to conntrack mark */ +struct md_mark { + u32 value; + u32 mask; +}; + +/* Metadata label for masked write to conntrack label. */ +struct md_label { + struct ovs_key_ct_label value; + struct ovs_key_ct_label mask; +}; + +/* Conntrack action context for execution. */ +struct ovs_conntrack_info { + struct nf_conntrack_helper *helper; + struct nf_conntrack_zone zone; + struct nf_conn *ct; + u32 flags; + u16 family; + struct md_mark mark; + struct md_label label; +}; + +static u16 key_to_nfproto(const struct sw_flow_key *key) +{ + switch (ntohs(key->eth.type)) { + case ETH_P_IP: + return NFPROTO_IPV4; + case ETH_P_IPV6: + return NFPROTO_IPV6; + default: + return NFPROTO_UNSPEC; + } +} + +/* Map SKB connection state into the values used by flow definition. */ +static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) +{ + u8 ct_state = OVS_CS_F_TRACKED; + + switch (ctinfo) { + case IP_CT_ESTABLISHED_REPLY: + case IP_CT_RELATED_REPLY: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_REPLY_DIR; + break; + default: + break; + } + + switch (ctinfo) { + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + ct_state |= OVS_CS_F_ESTABLISHED; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_RELATED; + break; + case IP_CT_NEW: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_NEW; + break; + default: + break; + } + + return ct_state; +} + +static u32 ovs_ct_get_mark(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + return ct ? ct->mark : 0; +#else + return 0; +#endif +} + +static void ovs_ct_get_label(const struct nf_conn *ct, + struct ovs_key_ct_label *label) +{ + struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; + + if (cl) { + size_t len = cl->words * sizeof(long); + + if (len > OVS_CT_LABEL_LEN) + len = OVS_CT_LABEL_LEN; + else if (len < OVS_CT_LABEL_LEN) + memset(label, 0, OVS_CT_LABEL_LEN); + memcpy(label, cl->bits, len); + } else { + memset(label, 0, OVS_CT_LABEL_LEN); + } +} + +static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, + const struct nf_conntrack_zone *zone, + const struct nf_conn *ct) +{ + key->ct.state = state; + key->ct.zone = zone->id; + key->ct.mark = ovs_ct_get_mark(ct); + ovs_ct_get_label(ct, &key->ct.label); +} + +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. + */ +static void ovs_ct_update_key(const struct sk_buff *skb, + struct sw_flow_key *key, bool post_ct) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u8 state = 0; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + state = ovs_ct_get_state(ctinfo); + if (ct->master) + state |= OVS_CS_F_RELATED; + zone = nf_ct_zone(ct); + } else if (post_ct) { + state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + } + __ovs_ct_update_key(key, state, zone, ct); +} + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +{ + ovs_ct_update_key(skb, key, false); +} + +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +{ + if (nla_put_u8(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_KEY_ATTR_CT_LABEL, sizeof(key->ct.label), + &key->ct.label)) + return -EMSGSIZE; + + return 0; +} + +static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, + u32 ct_mark, u32 mask) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 new_mark; + + + /* The connection could be invalid, in which case set_mark is no-op. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + new_mark = ct_mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + nf_conntrack_event_cache(IPCT_MARK, ct); + key->ct.mark = new_mark; + } + + return 0; +#else + return -ENOTSUPP; +#endif +} + +static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ct_label *label, + const struct ovs_key_ct_label *mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + int err; + + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) + return -ENOTSUPP; + + /* The connection could be invalid, in which case set_label is no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + cl = nf_ct_labels_find(ct); + if (!cl) { + nf_ct_labels_ext_add(ct); + cl = nf_ct_labels_find(ct); + } + if (!cl || cl->words * sizeof(long) < OVS_CT_LABEL_LEN) + return -ENOSPC; + + err = nf_connlabels_replace(ct, (u32 *)label, (u32 *)mask, + OVS_CT_LABEL_LEN / sizeof(u32)); + if (err) + return err; + + ovs_ct_get_label(ct, &key->ct.label); + return 0; +} + +/* 'skb' should already be pulled to nh_ofs. */ +static int ovs_ct_helper(struct sk_buff *skb, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + +static int handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, struct sk_buff *skb) +{ + struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + + if (key->eth.type == htons(ETH_P_IP)) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + int err; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + err = ip_defrag(net, skb, user); + if (err) + return err; + + ovs_cb.mru = IPCB(skb)->frag_max_size; + } else if (key->eth.type == htons(ETH_P_IPV6)) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + struct sk_buff *reasm; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + reasm = nf_ct_frag6_gather(net, skb, user); + if (!reasm) + return -EINPROGRESS; + + if (skb == reasm) + return -EINVAL; + + key->ip.proto = ipv6_hdr(reasm)->nexthdr; + skb_morph(skb, reasm); + consume_skb(reasm); + ovs_cb.mru = IP6CB(skb)->frag_max_size; +#else + return -EPFNOSUPPORT; +#endif + } else { + return -EPFNOSUPPORT; + } + + key->ip.frag = OVS_FRAG_TYPE_NONE; + skb_clear_hash(skb); + skb->ignore_df = 1; + *OVS_CB(skb) = ovs_cb; + + return 0; +} + +static struct nf_conntrack_expect * +ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, + u16 proto, const struct sk_buff *skb) +{ + struct nf_conntrack_tuple tuple; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) + return NULL; + return __nf_ct_expect_find(net, zone, &tuple); +} + +/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ +static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, + const struct ovs_conntrack_info *info) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) + return false; + if (info->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != info->helper) + return false; + } + + return true; +} + +static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + /* If we are recirculating packets to match on conntrack fields and + * committing with a separate conntrack action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + if (!skb_nfct_cached(net, skb, info)) { + struct nf_conn *tmpl = info->ct; + + /* Associate skb with specified zone. */ + if (tmpl) { + if (skb->nfct) + nf_conntrack_put(skb->nfct); + nf_conntrack_get(&tmpl->ct_general); + skb->nfct = &tmpl->ct_general; + skb->nfctinfo = IP_CT_NEW; + } + + if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, + skb) != NF_ACCEPT) + return -ENOENT; + + if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + WARN_ONCE(1, "helper rejected packet"); + return -EINVAL; + } + } + + return 0; +} + +/* Lookup connection and read fields into key. */ +static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nf_conntrack_expect *exp; + + exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); + if (exp) { + u8 state; + + state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; + __ovs_ct_update_key(key, state, &info->zone, exp->master); + } else { + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + ovs_ct_update_key(skb, key, true); + } + + return 0; +} + +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + u8 state; + int err; + + state = key->ct.state; + if (key->ct.zone == info->zone.id && + ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { + /* Previous lookup has shown that this connection is already + * tracked and committed. Skip committing. + */ + return 0; + } + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + return -EINVAL; + + ovs_ct_update_key(skb, key, true); + + return 0; +} + +static bool label_nonzero(const struct ovs_key_ct_label *label) +{ + size_t i; + + for (i = 0; i < sizeof(*label); i++) + if (label->ct_label[i]) + return true; + + return false; +} + +int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + int nh_ofs; + int err; + + /* The conntrack module expects to be working at L3. */ + nh_ofs = skb_network_offset(skb); + skb_pull(skb, nh_ofs); + + if (key->ip.frag != OVS_FRAG_TYPE_NONE) { + err = handle_fragments(net, key, info->zone.id, skb); + if (err) + return err; + } + + if (info->flags & OVS_CT_F_COMMIT) + err = ovs_ct_commit(net, key, info, skb); + else + err = ovs_ct_lookup(net, key, info, skb); + if (err) + goto err; + + if (info->mark.mask) { + err = ovs_ct_set_mark(skb, key, info->mark.value, + info->mark.mask); + if (err) + goto err; + } + if (label_nonzero(&info->label.mask)) + err = ovs_ct_set_label(skb, key, &info->label.value, + &info->label.mask); +err: + skb_push(skb, nh_ofs); + return err; +} + +static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, + const struct sw_flow_key *key, bool log) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + + helper = nf_conntrack_helper_try_module_get(name, info->family, + key->ip.proto); + if (!helper) { + OVS_NLERR(log, "Unknown helper \"%s\"", name); + return -EINVAL; + } + + help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); + if (!help) { + module_put(helper->me); + return -ENOMEM; + } + + rcu_assign_pointer(help->helper, helper); + info->helper = helper; + return 0; +} + +static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { + [OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32), + .maxlen = sizeof(u32) }, + [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), + .maxlen = sizeof(u16) }, + [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), + .maxlen = sizeof(struct md_mark) }, + [OVS_CT_ATTR_LABEL] = { .minlen = sizeof(struct md_label), + .maxlen = sizeof(struct md_label) }, + [OVS_CT_ATTR_HELPER] = { .minlen = 1, + .maxlen = NF_CT_HELPER_NAME_LEN } +}; + +static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, + const char **helper, bool log) +{ + struct nlattr *a; + int rem; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int maxlen = ovs_ct_attr_lens[type].maxlen; + int minlen = ovs_ct_attr_lens[type].minlen; + + if (type > OVS_CT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown conntrack attr (type=%d, max=%d)", + type, OVS_CT_ATTR_MAX); + return -EINVAL; + } + if (nla_len(a) < minlen || nla_len(a) > maxlen) { + OVS_NLERR(log, + "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", + type, nla_len(a), maxlen); + return -EINVAL; + } + + switch (type) { + case OVS_CT_ATTR_FLAGS: + info->flags = nla_get_u32(a); + break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case OVS_CT_ATTR_ZONE: + info->zone.id = nla_get_u16(a); + break; +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + case OVS_CT_ATTR_MARK: { + struct md_mark *mark = nla_data(a); + + info->mark = *mark; + break; + } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case OVS_CT_ATTR_LABEL: { + struct md_label *label = nla_data(a); + + info->label = *label; + break; + } +#endif + case OVS_CT_ATTR_HELPER: + *helper = nla_data(a); + if (!memchr(*helper, '\0', nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack helper"); + return -EINVAL; + } + break; + default: + OVS_NLERR(log, "Unknown conntrack attr (%d)", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); + return -EINVAL; + } + + return 0; +} + +bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) +{ + if (attr == OVS_KEY_ATTR_CT_STATE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + attr == OVS_KEY_ATTR_CT_ZONE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + attr == OVS_KEY_ATTR_CT_MARK) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + attr == OVS_KEY_ATTR_CT_LABEL) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + return ovs_net->xt_label; + } + + return false; +} + +int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + struct ovs_conntrack_info ct_info; + const char *helper = NULL; + u16 family; + int err; + + family = key_to_nfproto(key); + if (family == NFPROTO_UNSPEC) { + OVS_NLERR(log, "ct family unspecified"); + return -EINVAL; + } + + memset(&ct_info, 0, sizeof(ct_info)); + ct_info.family = family; + + nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); + + err = parse_ct(attr, &ct_info, &helper, log); + if (err) + return err; + + /* Set up template for tracking connections in specific zones. */ + ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); + if (!ct_info.ct) { + OVS_NLERR(log, "Failed to allocate conntrack template"); + return -ENOMEM; + } + if (helper) { + err = ovs_ct_add_helper(&ct_info, helper, key, log); + if (err) + goto err_free_ct; + } + + err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, + sizeof(ct_info), log); + if (err) + goto err_free_ct; + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + return 0; +err_free_ct: + nf_conntrack_free(ct_info.ct); + return err; +} + +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); + if (!start) + return -EMSGSIZE; + + if (nla_put_u32(skb, OVS_CT_ATTR_FLAGS, ct_info->flags)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), + &ct_info->mark)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_CT_ATTR_LABEL, sizeof(ct_info->label), + &ct_info->label)) + return -EMSGSIZE; + if (ct_info->helper) { + if (nla_put_string(skb, OVS_CT_ATTR_HELPER, + ct_info->helper->name)) + return -EMSGSIZE; + } + + nla_nest_end(skb, start); + + return 0; +} + +void ovs_ct_free_action(const struct nlattr *a) +{ + struct ovs_conntrack_info *ct_info = nla_data(a); + + if (ct_info->helper) + module_put(ct_info->helper->me); + if (ct_info->ct) + nf_ct_put(ct_info->ct); +} + +void ovs_ct_init(struct net *net) +{ + unsigned int n_bits = sizeof(struct ovs_key_ct_label) * BITS_PER_BYTE; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (nf_connlabels_get(net, n_bits)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } +} + +void ovs_ct_exit(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (ovs_net->xt_label) + nf_connlabels_put(net); +} diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h new file mode 100644 index 000000000000..43f5dd7a5577 --- /dev/null +++ b/net/openvswitch/conntrack.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OVS_CONNTRACK_H +#define OVS_CONNTRACK_H 1 + +#include "flow.h" + +struct ovs_conntrack_info; +enum ovs_key_attr; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +void ovs_ct_init(struct net *); +void ovs_ct_exit(struct net *); +bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); +int ovs_ct_copy_action(struct net *, const struct nlattr *, + const struct sw_flow_key *, struct sw_flow_actions **, + bool log); +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); + +int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, + const struct ovs_conntrack_info *); + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +void ovs_ct_free_action(const struct nlattr *a); +#else +#include <linux/errno.h> + +static inline void ovs_ct_init(struct net *net) { } + +static inline void ovs_ct_exit(struct net *net) { } + +static inline bool ovs_ct_verify(struct net *net, int attr) +{ + return false; +} + +static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla, + const struct sw_flow_key *key, + struct sw_flow_actions **acts, bool log) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + return -ENOTSUPP; +} + +static inline void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key) +{ + key->ct.state = 0; + key->ct.zone = 0; + key->ct.mark = 0; + memset(&key->ct.label, 0, sizeof(key->ct.label)); +} + +static inline int ovs_ct_put_key(const struct sw_flow_key *key, + struct sk_buff *skb) +{ + return 0; +} + +static inline void ovs_ct_free_action(const struct nlattr *a) { } +#endif /* CONFIG_NF_CONNTRACK */ +#endif /* ovs_conntrack.h */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ffe984f5b95c..a75828091e21 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -91,8 +91,7 @@ static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, static void ovs_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info) { - genl_notify(family, skb, genl_info_net(info), info->snd_portid, - 0, info->nlhdr, GFP_KERNEL); + genl_notify(family, skb, info, 0, GFP_KERNEL); } /** @@ -275,6 +274,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -400,9 +400,23 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, if (upcall_info->actions_len) size += nla_total_size(upcall_info->actions_len); + /* OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) + size += nla_total_size(sizeof(upcall_info->mru)); + return size; } +static void pad_packet(struct datapath *dp, struct sk_buff *skb) +{ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(skb->len) - skb->len; + + if (plen > 0) + memset(skb_put(skb, plen), 0, plen); + } +} + static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) @@ -476,7 +490,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (upcall_info->egress_tun_info) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); err = ovs_nla_put_egress_tunnel_key(user_skb, - upcall_info->egress_tun_info); + upcall_info->egress_tun_info, + upcall_info->egress_tun_opts); BUG_ON(err); nla_nest_end(user_skb, nla); } @@ -492,6 +507,16 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_nest_cancel(user_skb, nla); } + /* Add OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) { + if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, + upcall_info->mru)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { @@ -505,12 +530,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ - if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { - size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; - - if (plen > 0) - memset(skb_put(user_skb, plen), 0, plen); - } + pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; @@ -527,6 +547,7 @@ out: static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; @@ -535,6 +556,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; + u16 mru = 0; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -564,29 +586,35 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) else packet->protocol = htons(ETH_P_802_2); + /* Set packet's mru */ + if (a[OVS_PACKET_ATTR_MRU]) { + mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); + packet->ignore_df = 1; + } + OVS_CB(packet)->mru = mru; + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, - &flow->key, log); + err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], + packet, &flow->key, log); if (err) goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -598,6 +626,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!input_vport) goto err_unlock; + packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); @@ -624,6 +653,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, }; static const struct genl_ops dp_packet_genl_ops[] = { @@ -713,7 +743,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, /* OVS_FLOW_ATTR_ACTIONS */ if (should_fill_actions(ufid_flags)) - len += nla_total_size(acts->actions_len); + len += nla_total_size(acts->orig_len); return len + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ @@ -880,6 +910,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow *flow = NULL, *new_flow; @@ -915,12 +946,12 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, &mask); + ovs_flow_mask_key(&new_flow->key, &key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], @@ -929,8 +960,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; /* Validate actions. */ - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - &acts, log); + error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], + &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_flow; @@ -944,7 +975,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1038,7 +1069,8 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ -static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, +static struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) @@ -1047,8 +1079,8 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, struct sw_flow_key masked_key; int error; - ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, &acts, log); + ovs_flow_mask_key(&masked_key, key, true, mask); + error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); @@ -1060,6 +1092,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow_key key; @@ -1084,15 +1117,15 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, - log); + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, + &mask, log); if (IS_ERR(acts)) { error = PTR_ERR(acts); goto error; @@ -1108,7 +1141,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1174,6 +1207,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; @@ -1188,7 +1222,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, @@ -1232,6 +1266,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; @@ -1246,8 +1281,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, - log); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + NULL, log); if (unlikely(err)) return err; } @@ -2203,6 +2238,7 @@ static int __net_init ovs_init_net(struct net *net) INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + ovs_ct_init(net); return 0; } @@ -2237,6 +2273,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) struct net *net; LIST_HEAD(head); + ovs_ct_exit(dnet); ovs_lock(); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 6b28c5cedb23..f88038a99f44 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -27,9 +27,9 @@ #include <linux/u64_stats_sync.h> #include <net/ip_tunnels.h> +#include "conntrack.h" #include "flow.h" #include "flow_table.h" -#include "vport.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -93,14 +93,14 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @egress_tun_key: Tunnel information about this packet on egress path. - * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. + * @mru: The maximum received fragement size; 0 if the packet is not + * fragmented. */ struct ovs_skb_cb { - struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; + u16 mru; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -113,14 +113,17 @@ struct ovs_skb_cb { * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { - const struct ip_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; + const void *egress_tun_opts; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; u32 portid; u8 cmd; + u16 mru; }; /** @@ -131,7 +134,9 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; - struct vport_net vport_net; + + /* Module reference for configuring conntrack. */ + bool xt_label; }; extern int ovs_net_id; @@ -200,6 +205,10 @@ void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) + #define OVS_NLERR(logging_allowed, fmt, ...) \ do { \ if (logging_allowed && net_ratelimit()) \ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8db22ef73626..0ea128eeeab2 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -46,9 +46,11 @@ #include <net/mpls.h> #include <net/ndisc.h> +#include "conntrack.h" #include "datapath.h" #include "flow.h" #include "flow_netlink.h" +#include "vport.h" u64 ovs_flow_used_time(unsigned long flow_jiffies) { @@ -271,8 +273,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ipv6.addr.dst = nh->daddr; payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - if (unlikely(payload_ofs < 0)) - return -EINVAL; if (frag_off) { if (frag_off & htons(~0x7)) @@ -283,6 +283,13 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_NONE; } + /* Delayed handling of error in ipv6_skip_exthdr() as it + * always sets frag_off to a valid value which may be + * used to set key->ip.frag above. + */ + if (unlikely(payload_ofs < 0)) + return -EPROTO; + nh_len = payload_ofs - nh_ofs; skb_set_transport_header(skb, nh_ofs + nh_len); key->ip.proto = nexthdr; @@ -622,12 +629,16 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - memset(&key->ip, 0, sizeof(key->ip)); - memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); - if (nh_len == -EINVAL) { + switch (nh_len) { + case -EINVAL: + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); + /* fall-through */ + case -EPROTO: skb->transport_header = skb->network_header; error = 0; - } else { + break; + default: error = nh_len; } return error; @@ -687,19 +698,22 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, { /* Extract metadata from packet. */ if (tun_info) { + key->tun_proto = ip_tunnel_info_af(tun_info); memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); - if (tun_info->options) { + if (tun_info->options_len) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * 8)) - 1 > sizeof(key->tun_opts)); - memcpy(TUN_METADATA_OPTS(key, tun_info->options_len), - tun_info->options, tun_info->options_len); + + ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info); key->tun_opts_len = tun_info->options_len; } else { key->tun_opts_len = 0; } } else { + key->tun_proto = 0; key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); } @@ -707,13 +721,14 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; + ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; key->recirc_id = 0; return key_extract(skb, key); } -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { @@ -722,7 +737,7 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(attr, key, log); + err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) return err; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index b62cdb3e3589..5688e33e2de6 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -63,6 +63,7 @@ struct sw_flow_key { u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u8 tun_proto; /* Protocol of encapsulating tunnel. */ u32 ovs_flow_hash; /* Datapath computed hash value. */ u32 recirc_id; /* Recirculation ID. */ struct { @@ -111,6 +112,14 @@ struct sw_flow_key { } nd; } ipv6; }; + struct { + /* Connection tracking fields. */ + u16 zone; + u32 mark; + u8 state; + struct ovs_key_ct_label label; + } ct; + } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ struct sw_flow_key_range { @@ -144,6 +153,7 @@ struct sw_flow_id { struct sw_flow_actions { struct rcu_head rcu; + size_t orig_len; /* From flow_cmd_new netlink actions size */ u32 actions_len; struct nlattr actions[]; }; @@ -212,7 +222,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4e7a3f7facc2..77850f177a47 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -57,6 +57,7 @@ struct ovs_len_tbl { }; #define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) @@ -261,8 +262,8 @@ size_t ovs_tun_key_attr_size(void) * updating this function. */ return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ @@ -281,7 +282,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -290,6 +291,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABEL */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -300,6 +305,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, @@ -311,8 +320,11 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, - [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, + [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -339,8 +351,19 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, + [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, + [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_LABEL] = { .len = sizeof(struct ovs_key_ct_label) }, }; +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + static bool is_all_zero(const u8 *fp, size_t size) { int i; @@ -380,7 +403,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } expected_len = ovs_key_lens[type].len; - if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(nla), expected_len)) { OVS_NLERR(log, "Key %d has unexpected len %d expected %d", type, nla_len(nla), expected_len); return -EINVAL; @@ -465,29 +488,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, -}; - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) { - struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + struct nlattr *a; + int rem; unsigned long opt_key_offset; struct vxlan_metadata opts; - int err; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); - if (err < 0) - return err; - memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } - if (tb[OVS_VXLAN_EXT_GBP]) - opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } if (!is_mask) SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); @@ -500,14 +544,14 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static int ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) +static int ip_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) { struct nlattr *a; int rem; bool ttl = false; - __be16 tun_flags = 0; + __be16 tun_flags = 0, ipv4 = false, ipv6 = false; int opts_type = 0; nla_for_each_nested(a, attr, rem) { @@ -520,8 +564,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type].len != nla_len(a) && - ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type].len); return -EINVAL; @@ -536,10 +580,22 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, nla_get_in_addr(a), is_mask); + ipv4 = true; break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; break; case OVS_TUNNEL_KEY_ATTR_TOS: SW_FLOW_KEY_PUT(match, tun_key.tos, @@ -594,28 +650,46 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, opts_type = type; break; default: - OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", + OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); return -EINVAL; } } SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + if (is_mask) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); + else + SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? AF_INET6 : AF_INET, + false); if (rem > 0) { - OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", + OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.", rem); return -EINVAL; } + if (ipv4 && ipv6) { + OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes"); + return -EINVAL; + } + if (!is_mask) { - if (!match->key->tun_key.u.ipv4.dst) { + if (!ipv4 && !ipv6) { + OVS_NLERR(log, "IP tunnel dst address not specified"); + return -EINVAL; + } + if (ipv4 && !match->key->tun_key.u.ipv4.dst) { OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } + if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { + OVS_NLERR(log, "IPv6 tunnel dst address is zero"); + return -EINVAL; + } if (!ttl) { - OVS_NLERR(log, "IPv4 tunnel TTL not specified."); + OVS_NLERR(log, "IP tunnel TTL not specified."); return -EINVAL; } } @@ -640,21 +714,36 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, return 0; } -static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ip_tunnel_key *output, - const void *tun_opts, int swkey_tun_opts_len) +static int __ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (output->u.ipv4.src && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, - output->u.ipv4.src)) - return -EMSGSIZE; - if (output->u.ipv4.dst && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, - output->u.ipv4.dst)) - return -EMSGSIZE; + switch (tun_proto) { + case AF_INET: + if (output->u.ipv4.src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->u.ipv4.src)) + return -EMSGSIZE; + if (output->u.ipv4.dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&output->u.ipv6.src) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC, + &output->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&output->u.ipv6.dst) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST, + &output->u.ipv6.dst)) + return -EMSGSIZE; + break; + } if (output->tos && nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) return -EMSGSIZE; @@ -688,9 +777,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ip_tunnel_key *output, - const void *tun_opts, int swkey_tun_opts_len) +static int ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { struct nlattr *nla; int err; @@ -699,7 +789,8 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if (!nla) return -EMSGSIZE; - err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len, + tun_proto); if (err) return err; @@ -708,16 +799,18 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, } int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ip_tunnel_info *egress_tun_info) + const struct ip_tunnel_info *egress_tun_info, + const void *egress_tun_opts) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, - egress_tun_info->options, - egress_tun_info->options_len); + return __ip_tun_to_nlattr(skb, &egress_tun_info->key, + egress_tun_opts, + egress_tun_info->options_len, + ip_tunnel_info_af(egress_tun_info)); } -static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) { if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -763,21 +856,52 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask, log) < 0) + if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask, log) < 0) return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + + if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { + u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]); + + SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { + u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); + + SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { + u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); + + SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_LABEL) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABEL)) { + const struct ovs_key_ct_label *cl; + + cl = nla_data(a[OVS_KEY_ATTR_CT_LABEL]); + SW_FLOW_KEY_MEMCPY(match, ct.label, cl->ct_label, + sizeof(*cl), is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABEL); + } return 0; } -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 attrs, const struct nlattr **a, + bool is_mask, bool log) { int err; - err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); + err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); if (err) return err; @@ -1012,10 +1136,13 @@ static void nlattr_set(struct nlattr *attr, u8 val, /* The nlattr stream should already have been validated */ nla_for_each_nested(nla, attr, rem) { - if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) - nlattr_set(nla, val, tbl[nla_type(nla)].next); - else + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) { + if (tbl[nla_type(nla)].next) + tbl = tbl[nla_type(nla)].next; + nlattr_set(nla, val, tbl); + } else { memset(nla_data(nla), val, nla_len(nla)); + } } } @@ -1029,6 +1156,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * mask. In case the 'mask' is NULL, the flow is treated as exact match * flow. Otherwise, it is treated as a wildcarded flow, except the mask * does not include any don't care bit. + * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation @@ -1039,7 +1167,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. */ -int ovs_nla_get_match(struct sw_flow_match *match, +int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, const struct nlattr *nla_key, const struct nlattr *nla_mask, bool log) @@ -1089,7 +1217,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); + err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) return err; @@ -1116,7 +1244,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /* The userspace does not send tunnel attributes that * are 0, but we should not wildcard them nonetheless. */ - if (match->key->tun_key.u.ipv4.dst) + if (match->key->tun_proto) SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); @@ -1169,7 +1297,8 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); + err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, + log); if (err) goto free_newmask; } @@ -1250,7 +1379,7 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * extracted from the packet itself. */ -int ovs_nla_get_flow_metadata(const struct nlattr *attr, +int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, struct sw_flow_key *key, bool log) { @@ -1266,9 +1395,10 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, memset(&match, 0, sizeof(match)); match.key = key; + memset(&key->ct, 0, sizeof(key->ct)); key->phy.in_port = DP_MAX_PORTS; - return metadata_from_nlattrs(&match, &attrs, a, false, log); + return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } static int __ovs_nla_put_key(const struct sw_flow_key *swkey, @@ -1287,14 +1417,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.u.ipv4.dst || is_mask)) { + if ((swkey->tun_proto || is_mask)) { const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); - if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, - swkey->tun_opts_len)) + if (ip_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len, swkey->tun_proto)) goto nla_put_failure; } @@ -1314,6 +1444,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; + if (ovs_ct_put_key(output, skb)) + goto nla_put_failure; + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; @@ -1574,6 +1707,9 @@ void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) case OVS_ACTION_ATTR_SET: ovs_nla_free_set_action(a); break; + case OVS_ACTION_ATTR_CT: + ovs_ct_free_action(a); + break; } } @@ -1619,6 +1755,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); acts->actions_len = (*sfa)->actions_len; + acts->orig_len = (*sfa)->orig_len; kfree(*sfa); *sfa = acts; @@ -1646,8 +1783,8 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa, return a; } -static int add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len, bool log) +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, + int len, bool log) { struct nlattr *a; @@ -1662,7 +1799,7 @@ static inline int add_nested_action_start(struct sw_flow_actions **sfa, int used = (*sfa)->actions_len; int err; - err = add_action(sfa, attrtype, NULL, 0, log); + err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); if (err) return err; @@ -1678,12 +1815,12 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log); -static int validate_and_copy_sample(const struct nlattr *attr, +static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -1715,15 +1852,15 @@ static int validate_and_copy_sample(const struct nlattr *attr, start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32), log); + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32), log); if (err) return err; st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); if (st_acts < 0) return st_acts; - err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, eth_type, vlan_tci, log); if (err) return err; @@ -1790,7 +1927,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, int err = 0, start, opts_type; ovs_match_init(&match, &key, NULL); - opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); + opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; @@ -1826,21 +1963,17 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, tun_info = &tun_dst->u.tun_info; tun_info->mode = IP_TUNNEL_INFO_TX; + if (key.tun_proto == AF_INET6) + tun_info->mode |= IP_TUNNEL_INFO_IPV6; tun_info->key = key.tun_key; - tun_info->options_len = key.tun_opts_len; - - if (tun_info->options_len) { - /* We need to store the options in the action itself since - * everything else will go away after flow setup. We can append - * it to tun_info and then point there. - */ - memcpy((tun_info + 1), - TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len); - tun_info->options = (tun_info + 1); - } else { - tun_info->options = NULL; - } + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + ip_tunnel_info_opts_set(tun_info, + TUN_METADATA_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); add_nested_action_end(*sfa, start); return err; @@ -1878,8 +2011,7 @@ static int validate_set(const struct nlattr *a, key_len /= 2; if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type].len != key_len && - ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + !check_attr_len(key_len, ovs_key_lens[key_type].len)) return -EINVAL; if (masked && !validate_masked(nla_data(ovs_key), key_len)) @@ -1892,6 +2024,8 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABEL: case OVS_KEY_ATTR_ETHERNET: break; @@ -2057,7 +2191,7 @@ static int copy_action(const struct nlattr *from, return 0; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -2081,7 +2215,8 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_SET] = (u32)-1, [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_CT] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2188,13 +2323,20 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa, + err = validate_and_copy_sample(net, a, key, depth, sfa, eth_type, vlan_tci, log); if (err) return err; skip_copy = true; break; + case OVS_ACTION_ATTR_CT: + err = ovs_ct_copy_action(net, a, key, sfa, log); + if (err) + return err; + skip_copy = true; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -2213,7 +2355,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, } /* 'key' must be the masked key. */ -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) { @@ -2223,7 +2365,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, if (IS_ERR(*sfa)) return PTR_ERR(*sfa); - err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + (*sfa)->orig_len = nla_len(attr); + err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); @@ -2283,10 +2426,11 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->key, - tun_info->options_len ? - tun_info->options : NULL, - tun_info->options_len); + err = ip_tun_to_nlattr(skb, &tun_info->key, + tun_info->options_len ? + ip_tunnel_info_opts(tun_info) : NULL, + tun_info->options_len, + ip_tunnel_info_af(tun_info)); if (err) return err; nla_nest_end(skb, start); @@ -2348,6 +2492,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) if (err) return err; break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_action_to_attr(nla_data(a), skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index acd074408f0a..6ca3f0baf449 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -45,26 +45,30 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, - bool log); +int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, + struct sw_flow_key *, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); -int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, - const struct nlattr *mask, bool log); +int ovs_nla_get_match(struct net *, struct sw_flow_match *, + const struct nlattr *key, const struct nlattr *mask, + bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ip_tunnel_info *); + const struct ip_tunnel_info *, + const void *egress_tun_opts); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, const struct sw_flow_key *key, bool log); u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log); +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d22d8e948d0f..95dbcedf0bd4 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -57,20 +57,21 @@ static u16 range_n_bytes(const struct sw_flow_key_range *range) } void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask) + bool full, const struct sw_flow_mask *mask) { - const long *m = (const long *)((const u8 *)&mask->key + - mask->range.start); - const long *s = (const long *)((const u8 *)src + - mask->range.start); - long *d = (long *)((u8 *)dst + mask->range.start); + int start = full ? 0 : mask->range.start; + int len = full ? sizeof *dst : range_n_bytes(&mask->range); + const long *m = (const long *)((const u8 *)&mask->key + start); + const long *s = (const long *)((const u8 *)src + start); + long *d = (long *)((u8 *)dst + start); int i; - /* The memory outside of the 'mask->range' are not set since - * further operations on 'dst' only uses contents within - * 'mask->range'. + /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, + * if 'full' is false the memory outside of the 'mask->range' is left + * uninitialized. This can be used as an optimization when further + * operations on 'dst' only use contents within 'mask->range'. */ - for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + for (i = 0; i < len; i += sizeof(long)) *d++ = *s++ & *m++; } @@ -426,7 +427,7 @@ static u32 flow_hash(const struct sw_flow_key *key, static int flow_key_start(const struct sw_flow_key *key) { - if (key->tun_key.u.ipv4.dst) + if (key->tun_proto) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), @@ -475,7 +476,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, u32 hash; struct sw_flow_key masked_key; - ovs_flow_mask_key(&masked_key, unmasked, mask); + ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 616eda10d955..2dd9900f533d 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -86,5 +86,5 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask); + bool full, const struct sw_flow_mask *mask); #endif /* flow_table.h */ diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index d01bd6360970..2735e9c4a3b8 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -26,95 +26,42 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_geneve_vport_ops; - /** * struct geneve_port - Keeps track of open UDP ports - * @gs: The socket created for this port number. - * @name: vport name. + * @dst_port: destination port. */ struct geneve_port { - struct geneve_sock *gs; - char name[IFNAMSIZ]; + u16 port_no; }; -static LIST_HEAD(geneve_ports); - static inline struct geneve_port *geneve_vport(const struct vport *vport) { return vport_priv(vport); } -/* Convert 64 bit tunnel ID to 24 bit VNI. */ -static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) -{ -#ifdef __BIG_ENDIAN - vni[0] = (__force __u8)(tun_id >> 16); - vni[1] = (__force __u8)(tun_id >> 8); - vni[2] = (__force __u8)tun_id; -#else - vni[0] = (__force __u8)((__force u64)tun_id >> 40); - vni[1] = (__force __u8)((__force u64)tun_id >> 48); - vni[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -/* Convert 24 bit VNI to 64 bit tunnel ID. */ -static __be64 vni_to_tunnel_id(const __u8 *vni) -{ -#ifdef __BIG_ENDIAN - return (vni[0] << 16) | (vni[1] << 8) | vni[2]; -#else - return (__force __be64)(((__force u64)vni[0] << 40) | - ((__force u64)vni[1] << 48) | - ((__force u64)vni[2] << 56)); -#endif -} - -static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) -{ - struct vport *vport = gs->rcv_data; - struct genevehdr *geneveh = geneve_hdr(skb); - int opts_len; - struct ip_tunnel_info tun_info; - __be64 key; - __be16 flags; - - opts_len = geneveh->opt_len * 4; - - flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | - (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | - (geneveh->oam ? TUNNEL_OAM : 0) | - (geneveh->critical ? TUNNEL_CRIT_OPT : 0); - - key = vni_to_tunnel_id(geneveh->vni); - - ip_tunnel_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, geneveh->options, opts_len); - - ovs_vport_receive(vport, skb, &tun_info); -} - static int geneve_get_options(const struct vport *vport, struct sk_buff *skb) { struct geneve_port *geneve_port = geneve_vport(vport); - struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no)) return -EMSGSIZE; return 0; } -static void geneve_tnl_destroy(struct vport *vport) +static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct dp_upcall_info *upcall) { struct geneve_port *geneve_port = geneve_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = htons(geneve_port->port_no); + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - geneve_sock_release(geneve_port->gs); - - ovs_vport_deferred_free(vport); + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), + skb, IPPROTO_UDP, sport, dport); } static struct vport *geneve_tnl_create(const struct vport_parms *parms) @@ -122,11 +69,11 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; struct geneve_port *geneve_port; - struct geneve_sock *gs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - int err; u16 dst_port; + int err; if (!options) { err = -EINVAL; @@ -148,104 +95,40 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return vport; geneve_port = geneve_vport(vport); - strncpy(geneve_port->name, parms->name, IFNAMSIZ); + geneve_port->port_no = dst_port; - gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); - if (IS_ERR(gs)) { + rtnl_lock(); + dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)gs; + return ERR_CAST(dev); } - geneve_port->gs = gs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; error: return ERR_PTR(err); } -static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +static struct vport *geneve_create(const struct vport_parms *parms) { - const struct ip_tunnel_key *tun_key; - struct ip_tunnel_info *tun_info; - struct net *net = ovs_dp_get_net(vport->dp); - struct geneve_port *geneve_port = geneve_vport(vport); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport; - struct rtable *rt; - struct flowi4 fl; - u8 vni[3], opts_len, *opts; - __be16 df; - int err; - - tun_info = OVS_CB(skb)->egress_tun_info; - if (unlikely(!tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &tun_info->key; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - tunnel_id_to_vni(tun_key->tun_id, vni); - skb->ignore_df = 1; - - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) { - opts = (u8 *)tun_info->options; - opts_len = tun_info->options_len; - } else { - opts = NULL; - opts_len = 0; - } - - err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->u.ipv4.dst, tun_key->tos, - tun_key->ttl, df, sport, dport, - tun_key->tun_flags, vni, opts_len, opts, - !!(tun_key->tun_flags & TUNNEL_CSUM), false); - if (err < 0) - ip_rt_put(rt); - return err; - -error: - kfree_skb(skb); - return err; -} - -static const char *geneve_get_name(const struct vport *vport) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - - return geneve_port->name; -} + struct vport *vport; -static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + vport = geneve_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - /* Get tp_src and tp_dst, refert to geneve_build_header(). - */ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, sport, dport); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_geneve_vport_ops = { .type = OVS_VPORT_TYPE_GENEVE, - .create = geneve_tnl_create, - .destroy = geneve_tnl_destroy, - .get_name = geneve_get_name, + .create = geneve_create, + .destroy = ovs_netdev_tunnel_destroy, .get_options = geneve_get_options, - .send = geneve_tnl_send, + .send = ovs_netdev_send, .owner = THIS_MODULE, .get_egress_tun_info = geneve_get_egress_tun_info, }; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 871801d2ac23..4d24481669c9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -85,12 +85,10 @@ static struct vport *gre_create(const struct vport_parms *parms) } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) + struct dp_upcall_info *upcall) { - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_GRE, skb->mark, 0, 0); + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), + skb, IPPROTO_GRE, 0, 0); } static struct vport_ops ovs_gre_vport_ops = { diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index c058bbf876c3..388b8a6bf112 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -43,35 +43,26 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev) return netdev_priv(netdev); } -/* This function is only called by the kernel network layer.*/ -static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ - struct vport *vport = ovs_internal_dev_get_vport(netdev); - struct ovs_vport_stats vport_stats; - - ovs_vport_get_stats(vport, &vport_stats); - - /* The tx and rx stats need to be swapped because the - * switch and host OS have opposite perspectives. */ - stats->rx_packets = vport_stats.tx_packets; - stats->tx_packets = vport_stats.rx_packets; - stats->rx_bytes = vport_stats.tx_bytes; - stats->tx_bytes = vport_stats.rx_bytes; - stats->rx_errors = vport_stats.tx_errors; - stats->tx_errors = vport_stats.rx_errors; - stats->rx_dropped = vport_stats.tx_dropped; - stats->tx_dropped = vport_stats.rx_dropped; - - return stats; -} - /* Called with rcu_read_lock_bh. */ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { + int len, err; + + len = skb->len; rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); + + if (likely(!err)) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + netdev->stats.tx_errors++; + } return 0; } @@ -121,7 +112,6 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, - .ndo_get_stats64 = internal_dev_get_stats, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -135,7 +125,7 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -212,18 +202,17 @@ static void internal_dev_destroy(struct vport *vport) rtnl_unlock(); } -static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +static void internal_dev_recv(struct vport *vport, struct sk_buff *skb) { struct net_device *netdev = vport->dev; - int len; + struct pcpu_sw_netstats *stats; if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); - return 0; + netdev->stats.rx_dropped++; + return; } - len = skb->len; - skb_dst_drop(skb); nf_reset(skb); secpath_reset(skb); @@ -233,9 +222,13 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - netif_rx(skb); + stats = this_cpu_ptr(netdev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); - return len; + netif_rx(skb); } static struct vport_ops ovs_internal_vport_ops = { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index a75011505039..f7e8dcce7ada 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -39,8 +39,11 @@ static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. */ -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +static void netdev_port_receive(struct sk_buff *skb) { + struct vport *vport; + + vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; @@ -56,10 +59,8 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; - error: kfree_skb(skb); } @@ -68,15 +69,11 @@ error: static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; - struct vport *vport; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - vport = ovs_netdev_get_vport(skb->dev); - - netdev_port_receive(vport, skb); - + netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } @@ -203,27 +200,24 @@ static unsigned int packet_length(const struct sk_buff *skb) return length; } -int ovs_netdev_send(struct vport *vport, struct sk_buff *skb) +void ovs_netdev_send(struct vport *vport, struct sk_buff *skb) { int mtu = vport->dev->mtu; - int len; if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", vport->dev->name, packet_length(skb), mtu); + vport->dev->stats.tx_errors++; goto drop; } skb->dev = vport->dev; - len = skb->len; dev_queue_xmit(skb); - - return len; + return; drop: kfree_skb(skb); - return 0; } EXPORT_SYMBOL_GPL(ovs_netdev_send); diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 497cc81f1aca..bf22fcedbc69 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -27,7 +27,7 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); struct vport *ovs_netdev_link(struct vport *vport, const char *name); -int ovs_netdev_send(struct vport *vport, struct sk_buff *skb); +void ovs_netdev_send(struct vport *vport, struct sk_buff *skb); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 1e8b00a23a23..fb3cdb85905d 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -147,11 +147,12 @@ static struct vport *vxlan_create(const struct vport_parms *parms) } static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) + struct dp_upcall_info *upcall) { struct vxlan_dev *vxlan = netdev_priv(vport->dev); struct net *net = ovs_dp_get_net(vport->dp); - __be16 dst_port = vxlan_dev_dst_port(vxlan); + unsigned short family = ip_tunnel_info_af(upcall->egress_tun_info); + __be16 dst_port = vxlan_dev_dst_port(vxlan, family); __be16 src_port; int port_min; int port_max; @@ -159,9 +160,8 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, inet_get_local_port_range(net, &port_min, &port_max); src_port = udp_flow_src_port(net, skb, 0, 0, true); - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, + return ovs_tunnel_get_egress_info(upcall, net, + skb, IPPROTO_UDP, src_port, dst_port); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d73e5a16e7ca..dc81dc619aa2 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -34,9 +34,6 @@ #include "vport.h" #include "vport-internal_dev.h" -static void ovs_vport_record_error(struct vport *, - enum vport_err_type err_type); - static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ @@ -157,12 +154,6 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return ERR_PTR(-EINVAL); } - vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!vport->percpu_stats) { - kfree(vport); - return ERR_PTR(-ENOMEM); - } - return vport; } EXPORT_SYMBOL_GPL(ovs_vport_alloc); @@ -183,7 +174,6 @@ void ovs_vport_free(struct vport *vport) * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); - free_percpu(vport->percpu_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); @@ -290,30 +280,24 @@ void ovs_vport_del(struct vport *vport) */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { + struct net_device *dev = vport->dev; int i; memset(stats, 0, sizeof(*stats)); + stats->rx_errors = dev->stats.rx_errors; + stats->tx_errors = dev->stats.tx_errors; + stats->tx_dropped = dev->stats.tx_dropped; + stats->rx_dropped = dev->stats.rx_dropped; - /* We potentially have 2 sources of stats that need to be combined: - * those we have collected (split into err_stats and percpu_stats) from - * set_stats() and device error stats from netdev->get_stats() (for - * errors that happen downstream and therefore aren't reported through - * our vport_record_error() function). - * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). - * netdev-stats can be directly read over netlink-ioctl. - */ - - stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); - stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); - stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); - stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); + stats->rx_dropped += atomic_long_read(&dev->rx_dropped); + stats->tx_dropped += atomic_long_read(&dev->tx_dropped); for_each_possible_cpu(i) { const struct pcpu_sw_netstats *percpu_stats; struct pcpu_sw_netstats local_stats; unsigned int start; - percpu_stats = per_cpu_ptr(vport->percpu_stats, i); + percpu_stats = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); @@ -468,94 +452,25 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ip_tunnel_info *tun_info) +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ip_tunnel_info *tun_info) { - struct pcpu_sw_netstats *stats; struct sw_flow_key key; int error; - stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len + - (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - u64_stats_update_end(&stats->syncp); - OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_info = NULL; + OVS_CB(skb)->mru = 0; /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); - return; + return error; } ovs_dp_process_packet(skb, &key); + return 0; } EXPORT_SYMBOL_GPL(ovs_vport_receive); -/** - * ovs_vport_send - send a packet on a device - * - * @vport: vport on which to send the packet - * @skb: skb to send - * - * Sends the given packet and returns the length of data sent. Either ovs - * lock or rcu_read_lock must be held. - */ -int ovs_vport_send(struct vport *vport, struct sk_buff *skb) -{ - int sent = vport->ops->send(vport, skb); - - if (likely(sent > 0)) { - struct pcpu_sw_netstats *stats; - - stats = this_cpu_ptr(vport->percpu_stats); - - u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += sent; - u64_stats_update_end(&stats->syncp); - } else if (sent < 0) { - ovs_vport_record_error(vport, VPORT_E_TX_ERROR); - } else { - ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); - } - return sent; -} - -/** - * ovs_vport_record_error - indicate device error to generic stats layer - * - * @vport: vport that encountered the error - * @err_type: one of enum vport_err_type types to indicate the error type - * - * If using the vport generic stats layer indicate that an error of the given - * type has occurred. - */ -static void ovs_vport_record_error(struct vport *vport, - enum vport_err_type err_type) -{ - switch (err_type) { - case VPORT_E_RX_DROPPED: - atomic_long_inc(&vport->err_stats.rx_dropped); - break; - - case VPORT_E_RX_ERROR: - atomic_long_inc(&vport->err_stats.rx_errors); - break; - - case VPORT_E_TX_DROPPED: - atomic_long_inc(&vport->err_stats.tx_dropped); - break; - - case VPORT_E_TX_ERROR: - atomic_long_inc(&vport->err_stats.tx_errors); - break; - } - -} - static void free_vport_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); @@ -572,20 +487,24 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, struct net *net, - const struct ip_tunnel_info *tun_info, + struct sk_buff *skb, u8 ipproto, - u32 skb_mark, __be16 tp_src, __be16 tp_dst) { + struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info; + const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb); const struct ip_tunnel_key *tun_key; + u32 skb_mark = skb->mark; struct rtable *rt; struct flowi4 fl; if (unlikely(!tun_info)) return -EINVAL; + if (ip_tunnel_info_af(tun_info) != AF_INET) + return -EINVAL; tun_key = &tun_info->key; @@ -602,26 +521,26 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, /* Generate egress_tun_info based on tun_info, * saddr, tp_src and tp_dst */ - __ip_tunnel_info_init(egress_tun_info, - fl.saddr, tun_key->u.ipv4.dst, - tun_key->tos, - tun_key->ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); - + ip_tunnel_key_init(&egress_tun_info->key, + fl.saddr, tun_key->u.ipv4.dst, + tun_key->tos, + tun_key->ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags); + egress_tun_info->options_len = tun_info->options_len; + egress_tun_info->mode = tun_info->mode; + upcall->egress_tun_opts = ip_tunnel_info_opts(egress_tun_info); return 0; } EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *info) + struct dp_upcall_info *upcall) { /* get_egress_tun_info() is only implemented on tunnel ports. */ if (unlikely(!vport->ops->get_egress_tun_info)) return -EINVAL; - return vport->ops->get_egress_tun_info(vport, skb, info); + return vport->ops->get_egress_tun_info(vport, skb, upcall); } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index b88b3ee86f07..a413f3ae6a7b 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -36,10 +36,6 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ -struct vport_net { - struct vport __rcu *gre_vport; -}; - int ovs_vport_init(void); void ovs_vport_exit(void); @@ -57,26 +53,16 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int ovs_vport_send(struct vport *, struct sk_buff *); - -int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, struct net *net, - const struct ip_tunnel_info *tun_info, + struct sk_buff *, u8 ipproto, - u32 skb_mark, __be16 tp_src, __be16 tp_dst); -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *info); -/* The following definitions are for implementers of vport devices: */ +int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct dp_upcall_info *upcall); -struct vport_err_stats { - atomic_long_t rx_dropped; - atomic_long_t rx_errors; - atomic_long_t tx_dropped; - atomic_long_t tx_errors; -}; /** * struct vport_portids - array of netlink portids of a vport. * must be protected by rcu. @@ -102,8 +88,6 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. - * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @err_stats: Points to error statistics used and maintained by vport * @detach_list: list used for detaching vport in net-exit call. */ struct vport { @@ -116,9 +100,6 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_sw_netstats __percpu *percpu_stats; - - struct vport_err_stats err_stats; struct list_head detach_list; struct rcu_head rcu; }; @@ -157,8 +138,7 @@ struct vport_parms { * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. - * @get_name: Get the device's name. - * @send: Send a packet on the device. Returns the length of the packet sent, + * @send: Send a packet on the device. * zero for dropped packets or negative for error. * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for * a packet. @@ -173,24 +153,14 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - /* Called with rcu_read_lock or ovs_mutex. */ - const char *(*get_name)(const struct vport *); - - int (*send)(struct vport *, struct sk_buff *); + void (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ip_tunnel_info *); + struct dp_upcall_info *upcall); struct module *owner; struct list_head list; }; -enum vport_err_type { - VPORT_E_RX_DROPPED, - VPORT_E_RX_ERROR, - VPORT_E_TX_DROPPED, - VPORT_E_TX_ERROR, -}; - struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); @@ -227,8 +197,8 @@ static inline struct vport *vport_from_priv(void *priv) return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ip_tunnel_info *); +int ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -239,7 +209,7 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, static inline const char *ovs_vport_name(struct vport *vport) { - return vport->dev ? vport->dev->name : vport->ops->get_name(vport); + return vport->dev->name; } int ovs_vport_ops_register(struct vport_ops *ops); @@ -263,4 +233,10 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, rt = ip_route_output_key(net, fl); return rt; } + +static inline void ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ + vport->ops->send(vport, skb); +} + #endif /* vport.h */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7b8e39a22387..691660b9b7ef 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -230,6 +230,8 @@ struct packet_skb_cb { } sa; }; +#define vio_le() virtio_legacy_is_little_endian() + #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) @@ -1421,7 +1423,7 @@ static unsigned int fanout_demux_bpf(struct packet_fanout *f, rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) - ret = BPF_PROG_RUN(prog, skb) % num; + ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; @@ -1437,17 +1439,17 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); + struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; - if (!net_eq(dev_net(dev), read_pnet(&f->net)) || - !num) { + if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { - skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); + skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } @@ -1517,10 +1519,10 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { - if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) - return true; + if (sk->sk_family != PF_PACKET) + return false; - return false; + return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) @@ -1565,7 +1567,7 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, if (copy_from_user(&fprog, data, len)) return -EFAULT; - ret = bpf_prog_create_from_user(&new, &fprog, NULL); + ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; @@ -1937,16 +1939,16 @@ out_free: return err; } -static unsigned int run_filter(const struct sk_buff *skb, - const struct sock *sk, - unsigned int res) +static unsigned int run_filter(struct sk_buff *skb, + const struct sock *sk, + unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) - res = SK_RUN_FILTER(filter, skb); + res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; @@ -2628,6 +2630,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) __be16 proto; unsigned char *addr; int err, reserve = 0; + struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; int vnet_hdr_len; @@ -2663,6 +2666,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.mark = sk->sk_mark; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + goto out_unlock; + } + if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (po->has_vnet_hdr) { @@ -2680,15 +2690,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(false, vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(false, - __virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2); + (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) + vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); err = -EINVAL; - if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len) + if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) goto out_unlock; if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -2731,7 +2741,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, - __virtio16_to_cpu(false, vnet_hdr.hdr_len), + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; @@ -2772,14 +2782,14 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc.mark; packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset); + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); if (!skb_partial_csum_set(skb, s, o)) { err = -EINVAL; goto out_free; @@ -2787,7 +2797,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(false, vnet_hdr.gso_size); + __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); skb_shinfo(skb)->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ @@ -3161,9 +3171,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* This is a hint as to how much should be linear. */ vnet_hdr.hdr_len = - __cpu_to_virtio16(false, skb_headlen(skb)); + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); vnet_hdr.gso_size = - __cpu_to_virtio16(false, sinfo->gso_size); + __cpu_to_virtio16(vio_le(), sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) @@ -3181,9 +3191,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(false, + vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(false, + vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), skb->csum_offset); } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index a2f28a6d4dc5..384ea1e3cd69 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -72,13 +72,7 @@ static int rds_release(struct socket *sock) rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); - /* - * the binding lookup hash uses rcu, we need to - * make sure we synchronize_rcu before we free our - * entry - */ rds_remove_bound(rs); - synchronize_rcu(); rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); @@ -588,6 +582,8 @@ static int rds_init(void) { int ret; + rds_bind_lock_init(); + ret = rds_conn_init(); if (ret) goto out; diff --git a/net/rds/bind.c b/net/rds/bind.c index dd666fb9b4e1..bc6b93ecedb5 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -38,48 +38,50 @@ #include <linux/ratelimit.h> #include "rds.h" +struct bind_bucket { + rwlock_t lock; + struct hlist_head head; +}; + #define BIND_HASH_SIZE 1024 -static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; -static DEFINE_SPINLOCK(rds_bind_lock); +static struct bind_bucket bind_hash_table[BIND_HASH_SIZE]; -static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +static struct bind_bucket *hash_to_bucket(__be32 addr, __be16 port) { return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & (BIND_HASH_SIZE - 1)); } -static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, +/* must hold either read or write lock (write lock for insert != NULL) */ +static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket, + __be32 addr, __be16 port, struct rds_sock *insert) { struct rds_sock *rs; - struct hlist_head *head = hash_to_bucket(addr, port); + struct hlist_head *head = &bucket->head; u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - rcu_read_lock(); - hlist_for_each_entry_rcu(rs, head, rs_bound_node) { + hlist_for_each_entry(rs, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); if (cmp == needle) { - rcu_read_unlock(); + rds_sock_addref(rs); return rs; } } - rcu_read_unlock(); if (insert) { /* * make sure our addr and port are set before - * we are added to the list, other people - * in rcu will find us as soon as the - * hlist_add_head_rcu is done + * we are added to the list. */ insert->rs_bound_addr = addr; insert->rs_bound_port = port; rds_sock_addref(insert); - hlist_add_head_rcu(&insert->rs_bound_node, head); + hlist_add_head(&insert->rs_bound_node, head); } return NULL; } @@ -93,16 +95,21 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; + unsigned long flags; + struct bind_bucket *bucket = hash_to_bucket(addr, port); - rs = rds_bind_lookup(addr, port, NULL); + read_lock_irqsave(&bucket->lock, flags); + rs = rds_bind_lookup(bucket, addr, port, NULL); + read_unlock_irqrestore(&bucket->lock, flags); - if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) - rds_sock_addref(rs); - else + if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) { + rds_sock_put(rs); rs = NULL; + } rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); + return rs; } @@ -112,6 +119,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) unsigned long flags; int ret = -EADDRINUSE; u16 rover, last; + struct bind_bucket *bucket; if (*port != 0) { rover = be16_to_cpu(*port); @@ -121,42 +129,48 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - spin_lock_irqsave(&rds_bind_lock, flags); - do { + struct rds_sock *rrs; if (rover == 0) rover++; - if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + + bucket = hash_to_bucket(addr, cpu_to_be16(rover)); + write_lock_irqsave(&bucket->lock, flags); + rrs = rds_bind_lookup(bucket, addr, cpu_to_be16(rover), rs); + write_unlock_irqrestore(&bucket->lock, flags); + if (!rrs) { *port = rs->rs_bound_port; ret = 0; rdsdebug("rs %p binding to %pI4:%d\n", rs, &addr, (int)ntohs(*port)); break; + } else { + rds_sock_put(rrs); } } while (rover++ != last); - spin_unlock_irqrestore(&rds_bind_lock, flags); - return ret; } void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; + struct bind_bucket *bucket = + hash_to_bucket(rs->rs_bound_addr, rs->rs_bound_port); - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&bucket->lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - hlist_del_init_rcu(&rs->rs_bound_node); + hlist_del_init(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&bucket->lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -200,9 +214,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); - - /* we might have called rds_remove_bound on error */ - if (ret) - synchronize_rcu(); return ret; } + +void rds_bind_lock_init(void) +{ + int i; + + for (i = 0; i < BIND_HASH_SIZE; i++) + rwlock_init(&bind_hash_table[i].lock); +} diff --git a/net/rds/connection.c b/net/rds/connection.c index a50e652eb269..d4564036a339 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) } while (0) /* rcu read lock must be held or the connection spinlock */ -static struct rds_connection *rds_conn_lookup(struct hlist_head *head, +static struct rds_connection *rds_conn_lookup(struct net *net, + struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) { @@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_trans == trans) { + conn->c_trans == trans && net == rds_conn_net(conn)) { ret = conn; break; } @@ -127,12 +128,9 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct rds_transport *loop_trans; unsigned long flags; int ret; - struct rds_transport *otrans = trans; - if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) - goto new_conn; rcu_read_lock(); - conn = rds_conn_lookup(head, laddr, faddr, trans); + conn = rds_conn_lookup(net, head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && laddr == faddr && !is_outgoing) { /* This is a looped back IB connection, and we're @@ -146,7 +144,6 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (conn) goto out; -new_conn: conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); @@ -189,6 +186,12 @@ new_conn: } } + if (trans == NULL) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(-ENODEV); + goto out; + } + conn->c_trans = trans; ret = trans->conn_alloc(conn, gfp); @@ -200,6 +203,7 @@ new_conn: atomic_set(&conn->c_state, RDS_CONN_DOWN); conn->c_send_gen = 0; + conn->c_outgoing = (is_outgoing ? 1 : 0); conn->c_reconnect_jiffies = 0; INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); @@ -236,22 +240,13 @@ new_conn: /* Creating normal conn */ struct rds_connection *found; - if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) - found = NULL; - else - found = rds_conn_lookup(head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) || - (otrans->t_type != RDS_TRANS_TCP)) { - /* Only the active side should be added to - * reconnect list for TCP. - */ - hlist_add_head_rcu(&conn->c_hash_node, head); - } + hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } @@ -330,7 +325,9 @@ void rds_conn_shutdown(struct rds_connection *conn) rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); - rds_queue_reconnect(conn); + if (conn->c_trans->t_type != RDS_TRANS_TCP || + conn->c_outgoing == 1) + rds_queue_reconnect(conn); } else { rcu_read_unlock(); } diff --git a/net/rds/ib.c b/net/rds/ib.c index d020fade312c..a833ab7898fe 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -43,14 +43,14 @@ #include "rds.h" #include "ib.h" -static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; -unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE; +unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; -module_param(fmr_pool_size, int, 0444); -MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); -module_param(fmr_message_size, int, 0444); -MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_fmr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA"); +module_param(rds_ib_fmr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); @@ -97,10 +97,10 @@ static void rds_ib_dev_free(struct work_struct *work) struct rds_ib_device *rds_ibdev = container_of(work, struct rds_ib_device, free_work); - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); - if (rds_ibdev->mr) - ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->mr_8k_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); + if (rds_ibdev->mr_1m_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); @@ -150,9 +150,13 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_fmrs = dev_attr->max_fmr ? - min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : - fmr_pool_size; + rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? + min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; + + rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? + min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; @@ -164,18 +168,25 @@ static void rds_ib_add_one(struct ib_device *device) goto put_dev; } - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) { - rds_ibdev->mr = NULL; + rds_ibdev->mr_1m_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); + if (IS_ERR(rds_ibdev->mr_1m_pool)) { + rds_ibdev->mr_1m_pool = NULL; goto put_dev; } - rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); - if (IS_ERR(rds_ibdev->mr_pool)) { - rds_ibdev->mr_pool = NULL; + rds_ibdev->mr_8k_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); + if (IS_ERR(rds_ibdev->mr_8k_pool)) { + rds_ibdev->mr_8k_pool = NULL; goto put_dev; } + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", + dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, + rds_ibdev->max_8k_fmrs); + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); @@ -230,11 +241,10 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) * * This can be called at any time and can be racing with any other RDS path. */ -static void rds_ib_remove_one(struct ib_device *device) +static void rds_ib_remove_one(struct ib_device *device, void *client_data) { - struct rds_ib_device *rds_ibdev; + struct rds_ib_device *rds_ibdev = client_data; - rds_ibdev = ib_get_client_data(device, &rds_ib_client); if (!rds_ibdev) return; diff --git a/net/rds/ib.h b/net/rds/ib.h index 9fc95e38659a..f17d09567890 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -9,8 +9,11 @@ #include "rds.h" #include "rdma_transport.h" -#define RDS_FMR_SIZE 256 -#define RDS_FMR_POOL_SIZE 8192 +#define RDS_FMR_1M_POOL_SIZE (8192 / 2) +#define RDS_FMR_1M_MSG_SIZE 256 +#define RDS_FMR_8K_MSG_SIZE 2 +#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1)) +#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 @@ -24,6 +27,9 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 +#define RDS_IB_WC_MAX 32 +#define RDS_IB_SEND_OP BIT_ULL(63) + extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -89,6 +95,20 @@ struct rds_ib_work_ring { atomic_t w_free_ctr; }; +/* Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + + struct rds_ib_device; struct rds_ib_connection { @@ -100,9 +120,14 @@ struct rds_ib_connection { /* alphabet soup, IBTA style */ struct rdma_cm_id *i_cm_id; struct ib_pd *i_pd; - struct ib_mr *i_mr; struct ib_cq *i_send_cq; struct ib_cq *i_recv_cq; + struct ib_wc i_send_wc[RDS_IB_WC_MAX]; + struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; + + /* interrupt handling */ + struct tasklet_struct i_send_tasklet; + struct tasklet_struct i_recv_tasklet; /* tx */ struct rds_ib_work_ring i_send_ring; @@ -113,7 +138,6 @@ struct rds_ib_connection { atomic_t i_signaled_sends; /* rx */ - struct tasklet_struct i_recv_tasklet; struct mutex i_recv_mutex; struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; @@ -165,6 +189,12 @@ struct rds_ib_connection { struct rds_ib_ipaddr { struct list_head list; __be32 ipaddr; + struct rcu_head rcu; +}; + +enum { + RDS_IB_MR_8K_POOL, + RDS_IB_MR_1M_POOL, }; struct rds_ib_device { @@ -173,10 +203,12 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - struct ib_mr *mr; - struct rds_ib_mr_pool *mr_pool; - unsigned int fmr_max_remaps; unsigned int max_fmrs; + struct rds_ib_mr_pool *mr_1m_pool; + struct rds_ib_mr_pool *mr_8k_pool; + unsigned int fmr_max_remaps; + unsigned int max_8k_fmrs; + unsigned int max_1m_fmrs; int max_sge; unsigned int max_wrs; unsigned int max_initiator_depth; @@ -199,14 +231,14 @@ struct rds_ib_device { struct rds_ib_statistics { uint64_t s_ib_connect_raced; uint64_t s_ib_listen_closed_stale; - uint64_t s_ib_tx_cq_call; + uint64_t s_ib_evt_handler_call; + uint64_t s_ib_tasklet_call; uint64_t s_ib_tx_cq_event; uint64_t s_ib_tx_ring_full; uint64_t s_ib_tx_throttle; uint64_t s_ib_tx_sg_mapping_failure; uint64_t s_ib_tx_stalled; uint64_t s_ib_tx_credit_updates; - uint64_t s_ib_rx_cq_call; uint64_t s_ib_rx_cq_event; uint64_t s_ib_rx_ring_empty; uint64_t s_ib_rx_refill_from_cq; @@ -218,12 +250,18 @@ struct rds_ib_statistics { uint64_t s_ib_ack_send_delayed; uint64_t s_ib_ack_send_piggybacked; uint64_t s_ib_ack_received; - uint64_t s_ib_rdma_mr_alloc; - uint64_t s_ib_rdma_mr_free; - uint64_t s_ib_rdma_mr_used; - uint64_t s_ib_rdma_mr_pool_flush; - uint64_t s_ib_rdma_mr_pool_wait; - uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_rdma_mr_8k_alloc; + uint64_t s_ib_rdma_mr_8k_free; + uint64_t s_ib_rdma_mr_8k_used; + uint64_t s_ib_rdma_mr_8k_pool_flush; + uint64_t s_ib_rdma_mr_8k_pool_wait; + uint64_t s_ib_rdma_mr_8k_pool_depleted; + uint64_t s_ib_rdma_mr_1m_alloc; + uint64_t s_ib_rdma_mr_1m_free; + uint64_t s_ib_rdma_mr_1m_used; + uint64_t s_ib_rdma_mr_1m_pool_flush; + uint64_t s_ib_rdma_mr_1m_pool_wait; + uint64_t s_ib_rdma_mr_1m_pool_depleted; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; }; @@ -275,7 +313,8 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int fmr_message_size; +extern unsigned int rds_ib_fmr_1m_pool_size; +extern unsigned int rds_ib_fmr_8k_pool_size; extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; @@ -305,7 +344,8 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, + int npages); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, @@ -325,7 +365,8 @@ void rds_ib_recv_free_caches(struct rds_ib_connection *ic); void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc, + struct rds_ib_ack_state *state); void rds_ib_recv_tasklet_fn(unsigned long data); void rds_ib_recv_init_ring(struct rds_ib_connection *ic); void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); @@ -333,6 +374,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic); void rds_ib_attempt_ack(struct rds_ib_connection *ic); void rds_ib_ack_send_complete(struct rds_ib_connection *ic); u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required); /* ib_ring.c */ void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); @@ -350,7 +392,7 @@ extern wait_queue_head_t rds_ib_ring_empty_wait; void rds_ib_xmit_complete(struct rds_connection *conn); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic); int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index d150bb4aa3cb..2b2370e7f356 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -216,6 +216,96 @@ static void rds_ib_cq_event_handler(struct ib_event *event, void *data) event->event, ib_event_msg(event->event), data); } +/* Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr; + int i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + + if (wc->wr_id & RDS_IB_SEND_OP) + rds_ib_send_cqe_handler(ic, wc); + else + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + +static void rds_ib_tasklet_fn_send(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state state; + + rds_ib_stats_inc(s_ib_tasklet_call); + + memset(&state, 0, sizeof(state)); + poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + + if (rds_conn_up(conn) && + (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued))) + rds_send_xmit(ic->conn); +} + +static void rds_ib_tasklet_fn_recv(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + struct rds_ib_ack_state state; + + if (!rds_ibdev) + rds_conn_drop(conn); + + rds_ib_stats_inc(s_ib_tasklet_call); + + memset(&state, 0, sizeof(state)); + poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); +} + static void rds_ib_qp_event_handler(struct ib_event *event, void *data) { struct rds_connection *conn = data; @@ -238,6 +328,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) } } +static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_send_tasklet); +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -269,10 +371,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; - ic->i_mr = rds_ibdev->mr; cq_attr.cqe = ic->i_send_ring.w_nr + 1; - ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, + + ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_send_cq)) { @@ -283,7 +385,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) } cq_attr.cqe = ic->i_recv_ring.w_nr; - ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, + ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_recv_cq)) { @@ -375,7 +477,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rds_ib_recv_init_ack(ic); - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, ic->i_send_cq, ic->i_recv_cq); out: @@ -638,6 +740,7 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); /* first destroy the ib state that generates callbacks */ @@ -682,7 +785,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id = NULL; ic->i_pd = NULL; - ic->i_mr = NULL; ic->i_send_cq = NULL; ic->i_recv_cq = NULL; ic->i_send_hdrs = NULL; @@ -745,8 +847,10 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) } INIT_LIST_HEAD(&ic->ib_node); - tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, - (unsigned long) ic); + tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send, + (unsigned long)ic); + tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv, + (unsigned long)ic); mutex_init(&ic->i_recv_mutex); #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 251d1ce0b7c7..a2340748ec86 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -65,6 +65,7 @@ struct rds_ib_mr { * Our own little FMR pool */ struct rds_ib_mr_pool { + unsigned int pool_type; struct mutex flush_lock; /* serialize fmr invalidate */ struct delayed_work flush_worker; /* flush worker */ @@ -83,7 +84,7 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; -struct workqueue_struct *rds_ib_fmr_wq; +static struct workqueue_struct *rds_ib_fmr_wq; int rds_ib_fmr_init(void) { @@ -159,10 +160,8 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) } spin_unlock_irq(&rds_ibdev->spinlock); - if (to_free) { - synchronize_rcu(); - kfree(to_free); - } + if (to_free) + kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) @@ -236,7 +235,8 @@ void rds_ib_destroy_nodev_conns(void) rds_conn_destroy(ic->conn); } -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) { struct rds_ib_mr_pool *pool; @@ -244,6 +244,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) if (!pool) return ERR_PTR(-ENOMEM); + pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); @@ -251,28 +252,30 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); - pool->fmr_attr.max_pages = fmr_message_size; + if (pool_type == RDS_IB_MR_1M_POOL) { + /* +1 allows for unaligned MRs */ + pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1; + pool->max_items = RDS_FMR_1M_POOL_SIZE; + } else { + /* pool_type == RDS_IB_MR_8K_POOL */ + pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1; + pool->max_items = RDS_FMR_8K_POOL_SIZE; + } + + pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; pool->fmr_attr.page_shift = PAGE_SHIFT; - pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; - - /* We never allow more than max_items MRs to be allocated. - * When we exceed more than max_items_soft, we start freeing - * items more aggressively. - * Make sure that max_items > max_items_soft > max_items / 2 - */ pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; - pool->max_items = rds_ibdev->max_fmrs; return pool; } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; - iinfo->rdma_mr_max = pool->max_items; - iinfo->rdma_mr_size = pool->fmr_attr.max_pages; + iinfo->rdma_mr_max = pool_1m->max_items; + iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) @@ -314,14 +317,28 @@ static inline void wait_clean_list_grace(void) } } -static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, + int npages) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool; struct rds_ib_mr *ibmr = NULL; int err = 0, iter = 0; + if (npages <= RDS_FMR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); + + /* Switch pools if one of the pool is reaching upper limit */ + if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + pool = rds_ibdev->mr_1m_pool; + else + pool = rds_ibdev->mr_8k_pool; + } while (1) { ibmr = rds_ib_reuse_fmr(pool); @@ -343,12 +360,18 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) atomic_dec(&pool->item_count); if (++iter > 2) { - rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); return ERR_PTR(-EAGAIN); } /* We do have some empty MRs. Flush them out. */ - rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; @@ -373,7 +396,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } - rds_ib_stats_inc(s_ib_rdma_mr_alloc); + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + return ibmr; out_no_cigar: @@ -429,7 +457,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm } page_cnt += len >> PAGE_SHIFT; - if (page_cnt > fmr_message_size) + if (page_cnt > ibmr->pool->fmr_attr.max_pages) return -EINVAL; dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, @@ -461,7 +489,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm ibmr->sg_dma_len = sg_dma_len; ibmr->remap_count++; - rds_ib_stats_inc(s_ib_rdma_mr_used); + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); ret = 0; out: @@ -524,8 +555,7 @@ static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) __rds_ib_teardown_mr(ibmr); if (pinned) { - struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } @@ -594,7 +624,7 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, * to free as many MRs as needed to get back to this limit. */ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, - int free_all, struct rds_ib_mr **ibmr_ret) + int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr, *next; struct llist_node *clean_nodes; @@ -605,11 +635,14 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; int ret = 0; - rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); - while(!mutex_trylock(&pool->flush_lock)) { + while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) { *ibmr_ret = ibmr; @@ -666,8 +699,12 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { unpinned += ibmr->sg_len; __rds_ib_teardown_mr(ibmr); - if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { - rds_ib_stats_inc(s_ib_rdma_mr_free); + if (nfreed < free_goal || + ibmr->remap_count >= pool->fmr_attr.max_maps) { + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); list_del(&ibmr->unmap_list); ib_dealloc_fmr(ibmr->fmr); kfree(ibmr); @@ -719,8 +756,8 @@ static void rds_ib_mr_pool_flush_worker(struct work_struct *work) void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); @@ -759,10 +796,11 @@ void rds_ib_flush_mrs(void) down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + if (rds_ibdev->mr_8k_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); - if (pool) - rds_ib_flush_mr_pool(pool, 0, NULL); + if (rds_ibdev->mr_1m_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } @@ -780,12 +818,12 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - if (!rds_ibdev->mr_pool) { + if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } - ibmr = rds_ib_alloc_fmr(rds_ibdev); + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); if (IS_ERR(ibmr)) { rds_ib_dev_put(rds_ibdev); return ibmr; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 6bbe62060060..96744b75db93 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -62,12 +62,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; sge = &recv->r_sge[1]; sge->addr = 0; sge->length = RDS_FRAG_SIZE; - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; } } @@ -564,7 +564,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) sge->addr = ic->i_ack_dma; sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; wr->sg_list = sge; wr->num_sge = 1; @@ -596,8 +596,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) * wr_id and avoids working with the ring in that case. */ #ifndef KERNEL_HAS_ATOMIC64 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, - int ack_required) +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { unsigned long flags; @@ -622,8 +621,7 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic) return seq; } #else -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, - int ack_required) +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { @@ -830,20 +828,6 @@ static void rds_ib_cong_recv(struct rds_connection *conn, rds_cong_map_updated(map, uncongested); } -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. - */ -struct rds_ib_ack_state { - u64 ack_next; - u64 ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - static void rds_ib_process_recv(struct rds_connection *conn, struct rds_ib_recv_work *recv, u32 data_len, struct rds_ib_ack_state *state) @@ -969,96 +953,50 @@ static void rds_ib_process_recv(struct rds_connection *conn, } } -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rds_recv_incoming() per RDS connection. - */ -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p cq %p\n", conn, cq); - - rds_ib_stats_inc(s_ib_rx_cq_call); - - tasklet_schedule(&ic->i_recv_tasklet); -} - -static inline void rds_poll_cq(struct rds_ib_connection *ic, - struct rds_ib_ack_state *state) +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, + struct ib_wc *wc, + struct rds_ib_ack_state *state) { struct rds_connection *conn = ic->conn; - struct ib_wc wc; struct rds_ib_recv_work *recv; - while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - ib_wc_status_msg(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_rx_cq_event); + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); - recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; - - ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); - - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. - */ - if (wc.status == IB_WC_SUCCESS) { - rds_ib_process_recv(conn, recv, wc.byte_len, state); - } else { - /* We expect errors as the qp is drained during shutdown */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on %pI4 had " - "status %u (%s), disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status, - ib_wc_status_msg(wc.status)); - } + rds_ib_stats_inc(s_ib_rx_cq_event); + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, + DMA_FROM_DEVICE); - /* - * rds_ib_process_recv() doesn't always consume the frag, and - * we might not have called it at all if the wc didn't indicate - * success. We already unmapped the frag's pages, though, and - * the following rds_ib_ring_free() call tells the refill path - * that it will not find an allocated frag here. Make sure we - * keep that promise by freeing a frag that's still on the ring. - */ - if (recv->r_frag) { - rds_ib_frag_free(ic, recv->r_frag); - recv->r_frag = NULL; - } - rds_ib_ring_free(&ic->i_recv_ring, 1); + /* Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (wc->status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc->byte_len, state); + } else { + /* We expect errors as the qp is drained during shutdown */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status)); } -} -void rds_ib_recv_tasklet_fn(unsigned long data) -{ - struct rds_ib_connection *ic = (struct rds_ib_connection *) data; - struct rds_connection *conn = ic->conn; - struct rds_ib_ack_state state = { 0, }; - - rds_poll_cq(ic, &state); - ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - rds_poll_cq(ic, &state); - - if (state.ack_next_valid) - rds_ib_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rds_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; + /* rds_ib_process_recv() doesn't always consume the frag, and + * we might not have called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and + * the following rds_ib_ring_free() call tells the refill path + * that it will not find an allocated frag here. Make sure we + * keep that promise by freeing a frag that's still on the ring. + */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; } - if (rds_conn_up(conn)) - rds_ib_attempt_ack(ic); + rds_ib_ring_free(&ic->i_recv_ring, 1); /* If we ever end up with a really empty receive ring, we're * in deep trouble, as the sender will definitely see RNR diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index c576ebeb4115..670882c752e9 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -195,16 +195,16 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_op = NULL; - send->s_wr.wr_id = i; + send->s_wr.wr_id = i | RDS_IB_SEND_OP; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; - send->s_sge[1].lkey = ic->i_mr->lkey; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; } } @@ -237,81 +237,73 @@ static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) * unallocs the next free entry in the ring it doesn't alter which is * the next to be freed, which is what this is concerned with. */ -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) { - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; struct rds_message *rm = NULL; - struct ib_wc wc; + struct rds_connection *conn = ic->conn; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; - int ret; int nr_sig = 0; - rdsdebug("cq %p conn %p\n", cq, conn); - rds_ib_stats_inc(s_ib_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - ib_wc_status_msg(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_tx_cq_event); - - if (wc.wr_id == RDS_IB_ACK_WR_ID) { - if (time_after(jiffies, ic->i_ack_queued + HZ/2)) - rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ack_send_complete(ic); - continue; - } - oldest = rds_ib_ring_oldest(&ic->i_send_ring); + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); - completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + if (wc->wr_id == RDS_IB_ACK_WR_ID) { + if (time_after(jiffies, ic->i_ack_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + return; + } - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - if (send->s_wr.send_flags & IB_SEND_SIGNALED) - nr_sig++; + oldest = rds_ib_ring_oldest(&ic->i_send_ring); - rm = rds_ib_send_unmap_op(ic, send, wc.status); + completed = rds_ib_ring_completed(&ic->i_send_ring, + (wc->wr_id & ~RDS_IB_SEND_OP), + oldest); - if (time_after(jiffies, send->s_queued + HZ/2)) - rds_ib_stats_inc(s_ib_tx_stalled); + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; - if (send->s_op) { - if (send->s_op == rm->m_final_op) { - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - } - rds_message_put(rm); - send->s_op = NULL; - } + rm = rds_ib_send_unmap_op(ic, send, wc->status); - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } + if (time_after(jiffies, send->s_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ring_free(&ic->i_send_ring, completed); - rds_ib_sub_signaled(ic, nr_sig); - nr_sig = 0; - - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status " - "%u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc.status, - ib_wc_status_msg(wc.status)); + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get + * flushed out, wake them up now + */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc->status, + ib_wc_status_msg(wc->status)); } } @@ -818,7 +810,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) /* Convert our struct scatterlist to struct ib_sge */ send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); - send->s_sge[0].lkey = ic->i_mr->lkey; + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, send->s_sge[0].addr, send->s_sge[0].length); @@ -932,7 +924,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_mr->lkey; + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 2d5965d6e97c..d77e04473056 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -42,14 +42,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", - "ib_tx_cq_call", + "s_ib_evt_handler_call", + "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", "ib_tx_throttle", "ib_tx_sg_mapping_failure", "ib_tx_stalled", "ib_tx_credit_updates", - "ib_rx_cq_call", "ib_rx_cq_event", "ib_rx_ring_empty", "ib_rx_refill_from_cq", @@ -61,12 +61,18 @@ static const char *const rds_ib_stat_names[] = { "ib_ack_send_delayed", "ib_ack_send_piggybacked", "ib_ack_received", - "ib_rdma_mr_alloc", - "ib_rdma_mr_free", - "ib_rdma_mr_used", - "ib_rdma_mr_pool_flush", - "ib_rdma_mr_pool_wait", - "ib_rdma_mr_pool_depleted", + "ib_rdma_mr_8k_alloc", + "ib_rdma_mr_8k_free", + "ib_rdma_mr_8k_used", + "ib_rdma_mr_8k_pool_flush", + "ib_rdma_mr_8k_pool_wait", + "ib_rdma_mr_8k_pool_depleted", + "ib_rdma_mr_1m_alloc", + "ib_rdma_mr_1m_free", + "ib_rdma_mr_1m_used", + "ib_rdma_mr_1m_pool_flush", + "ib_rdma_mr_1m_pool_wait", + "ib_rdma_mr_1m_pool_depleted", "ib_atomic_cswp", "ib_atomic_fadd", }; diff --git a/net/rds/iw.c b/net/rds/iw.c index 5d5a9d258658..3df0295c6659 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -125,12 +125,11 @@ free_attr: kfree(dev_attr); } -static void rds_iw_remove_one(struct ib_device *device) +static void rds_iw_remove_one(struct ib_device *device, void *client_data) { - struct rds_iw_device *rds_iwdev; + struct rds_iw_device *rds_iwdev = client_data; struct rds_iw_cm_id *i_cm_id, *next; - rds_iwdev = ib_get_client_data(device, &rds_iw_client); if (!rds_iwdev) return; @@ -149,10 +148,7 @@ static void rds_iw_remove_one(struct ib_device *device) if (rds_iwdev->mr) ib_dereg_mr(rds_iwdev->mr); - while (ib_dealloc_pd(rds_iwdev->pd)) { - rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); - msleep(1); - } + ib_dealloc_pd(rds_iwdev->pd); list_del(&rds_iwdev->list); kfree(rds_iwdev); diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index dba8d0864f18..6a8fbd6e69e7 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -667,11 +667,12 @@ static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct ib_mr *mr; int err; - mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size); + mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG, + pool->max_message_size); if (IS_ERR(mr)) { err = PTR_ERR(mr); - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err); + printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err); return err; } diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 334fe98c5084..86152ec3b887 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -153,9 +153,10 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic) sge->length = sizeof(struct rds_header); sge->lkey = 0; - send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); + send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG, + fastreg_message_size); if (IS_ERR(send->s_mr)) { - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n"); + printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n"); break; } diff --git a/net/rds/rds.h b/net/rds/rds.h index afb4048d0cfd..543c308fcc2a 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -86,7 +86,9 @@ struct rds_connection { struct hlist_node c_hash_node; __be32 c_laddr; __be32 c_faddr; - unsigned int c_loopback:1; + unsigned int c_loopback:1, + c_outgoing:1, + c_pad_to_32:30; struct rds_connection *c_passive; struct rds_cong_map *c_lcong; @@ -603,6 +605,7 @@ extern wait_queue_head_t rds_poll_waitq; int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(__be32 addr, __be16 port); +void rds_bind_lock_init(void); /* cong.c */ int rds_cong_get_maps(struct rds_connection *conn); diff --git a/net/rds/send.c b/net/rds/send.c index 4df61a515b83..ee49c2556f47 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -38,6 +38,7 @@ #include <linux/list.h> #include <linux/ratelimit.h> #include <linux/export.h> +#include <linux/sizes.h> #include "rds.h" @@ -51,7 +52,7 @@ * it to 0 will restore the old behavior (where we looped until we had * drained the queue). */ -static int send_batch_count = 64; +static int send_batch_count = SZ_1K; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); @@ -223,7 +224,7 @@ restart: * through a lot of messages, lets back off and see * if anyone else jumps in */ - if (batch_count >= 1024) + if (batch_count >= send_batch_count) goto over_batch; spin_lock_irqsave(&conn->c_lock, flags); @@ -423,12 +424,15 @@ over_batch: !list_empty(&conn->c_send_queue)) && send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); - goto restart; + if (batch_count < send_batch_count) + goto restart; + queue_delayed_work(rds_wq, &conn->c_send_w, 1); } } out: return ret; } +EXPORT_SYMBOL_GPL(rds_send_xmit); static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) { @@ -1120,8 +1124,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) */ rds_stats_inc(s_send_queued); - if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_xmit(conn); + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); rds_message_put(rm); return payload_len; @@ -1177,8 +1182,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); rds_message_put(rm); return 0; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c42b60bf4c68..9d6ddbacd875 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -67,21 +67,13 @@ void rds_tcp_nonagle(struct socket *sock) set_fs(oldfs); } +/* All module specific customizations to the RDS-TCP socket should be done in + * rds_tcp_tune() and applied after socket creation. In general these + * customizations should be tunable via module_param() + */ void rds_tcp_tune(struct socket *sock) { - struct sock *sk = sock->sk; - rds_tcp_nonagle(sock); - - /* - * We're trying to saturate gigabit with the default, - * see svc_sock_setbufsize(). - */ - lock_sock(sk); - sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; - sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; - sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; - release_sock(sk); } u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 444d78d0bd77..1d90240e5d82 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -110,28 +110,24 @@ int rds_tcp_accept_one(struct socket *sock) goto out; } /* An incoming SYN request came in, and TCP just accepted it. - * We always create a new conn for listen side of TCP, and do not - * add it to the c_hash_list. * * If the client reboots, this conn will need to be cleaned up. * rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - WARN_ON(!rs_tcp || rs_tcp->t_sock); - - /* - * see the comment above rds_queue_delayed_reconnect() - */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - if (rds_conn_state(conn) == RDS_CONN_UP) - rds_tcp_stats_inc(s_tcp_listen_closed_stale); - else - rds_tcp_stats_inc(s_tcp_connect_raced); - rds_conn_drop(conn); + if (rs_tcp->t_sock && + ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + struct sock *nsk = new_sock->sk; + + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); + new_sock = NULL; ret = 0; goto out; } + rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); rds_tcp_set_callbacks(new_sock, conn); rds_connect_complete(conn); new_sock = NULL; diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 53b17ca0dff5..2894e6095e3b 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -83,6 +83,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, struct rds_tcp_connection *tc = conn->c_transport_data; int done = 0; int ret = 0; + int more; if (hdr_off == 0) { /* @@ -116,12 +117,15 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } + more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0; while (sg < rm->data.op_nents) { + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + ret = tc->t_sock->ops->sendpage(tc->t_sock, sg_page(&rm->data.op_sg[sg]), rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, - MSG_DONTWAIT|MSG_NOSIGNAL); + flags); rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, ret); @@ -134,6 +138,8 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, off = 0; sg++; } + if (sg == rm->data.op_nents - 1) + more = 0; } out: diff --git a/net/rds/threads.c b/net/rds/threads.c index dc2402e871fd..454aa6d23327 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -162,7 +162,9 @@ void rds_send_worker(struct work_struct *work) int ret; if (rds_conn_state(conn) == RDS_CONN_UP) { + clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); ret = rds_send_xmit(conn); + cond_resched(); rdsdebug("conn %p ret %d\n", conn, ret); switch (ret) { case -EAGAIN: diff --git a/net/rfkill/core.c b/net/rfkill/core.c index f12149a29cb1..b41e9ea2ffff 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; - rfkill_global_states[type].cur = blocked; + if (type == RFKILL_TYPE_ALL) { + int i; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = blocked; + } else { + rfkill_global_states[type].cur = blocked; + } + list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 6631f4f1e39b..692b3e67fb54 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -808,7 +808,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn) ASSERTCMP(atomic_read(&conn->usage), >, 0); - conn->put_time = get_seconds(); + conn->put_time = ktime_get_seconds(); if (atomic_dec_and_test(&conn->usage)) { _debug("zombie"); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); @@ -852,7 +852,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) _enter(""); - now = get_seconds(); + now = ktime_get_seconds(); earliest = ULONG_MAX; write_lock_bh(&rxrpc_connection_lock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index aef1bd294e17..2934a73a5981 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -208,7 +208,7 @@ struct rxrpc_transport { struct rb_root server_conns; /* server connections on this transport */ struct list_head link; /* link in master session list */ struct sk_buff_head error_queue; /* error packets awaiting processing */ - time_t put_time; /* time at which to reap */ + unsigned long put_time; /* time at which to reap */ spinlock_t client_lock; /* client connection allocation lock */ rwlock_t conn_lock; /* lock for active/dead connections */ atomic_t usage; @@ -256,7 +256,7 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long events; #define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ - time_t put_time; /* time at which to reap */ + unsigned long put_time; /* time at which to reap */ rwlock_t lock; /* access lock */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c index 1976dec84f29..9946467f16b4 100644 --- a/net/rxrpc/ar-transport.c +++ b/net/rxrpc/ar-transport.c @@ -189,7 +189,7 @@ void rxrpc_put_transport(struct rxrpc_transport *trans) ASSERTCMP(atomic_read(&trans->usage), >, 0); - trans->put_time = get_seconds(); + trans->put_time = ktime_get_seconds(); if (unlikely(atomic_dec_and_test(&trans->usage))) { _debug("zombie"); /* let the reaper determine the timeout to avoid a race with @@ -226,7 +226,7 @@ static void rxrpc_transport_reaper(struct work_struct *work) _enter(""); - now = get_seconds(); + now = ktime_get_seconds(); earliest = ULONG_MAX; /* extract all the transports that have been dead too long */ diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 559bfa011bda..0bc6f912f870 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -72,6 +72,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, case TC_ACT_PIPE: case TC_ACT_RECLASSIFY: case TC_ACT_OK: + case TC_ACT_REDIRECT: action = filter_res; break; case TC_ACT_SHOT: diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 5019a47b9270..bb41699c6c49 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -68,13 +68,13 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, } if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - proto, &tuple)) + proto, ca->net, &tuple)) goto out; zone.id = ca->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; - thash = nf_conntrack_find_get(dev_net(skb->dev), &zone, &tuple); + thash = nf_conntrack_find_get(ca->net, &zone, &tuple); if (!thash) goto out; @@ -119,6 +119,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci = to_connmark(a); ci->tcf_action = parm->action; + ci->net = net; ci->zone = parm->zone; tcf_hash_insert(a); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 99c9cc1c7af9..d05869646515 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -189,6 +189,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, * worry later - danger - this API seems to have changed * from earlier kernels */ + par.net = dev_net(skb->dev); par.in = skb->dev; par.out = NULL; par.hooknum = ipt->tcfi_hook; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index e5168f8b9640..5faaa5425f7b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -38,6 +38,7 @@ struct cls_bpf_prog { struct bpf_prog *filter; struct list_head link; struct tcf_result res; + bool exts_integrated; struct tcf_exts exts; u32 handle; union { @@ -52,6 +53,7 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FLAGS] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, @@ -59,6 +61,20 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; +static int cls_bpf_exec_opcode(int code) +{ + switch (code) { + case TC_ACT_OK: + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + case TC_ACT_REDIRECT: + case TC_ACT_UNSPEC: + return code; + default: + return TC_ACT_UNSPEC; + } +} + static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -79,6 +95,8 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res; + qdisc_skb_cb(skb)->tc_classid = prog->res.classid; + if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); @@ -88,6 +106,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, filter_res = BPF_PROG_RUN(prog->filter, skb); } + if (prog->exts_integrated) { + res->class = prog->res.class; + res->classid = qdisc_skb_cb(skb)->tc_classid; + + ret = cls_bpf_exec_opcode(filter_res); + if (ret == TC_ACT_UNSPEC) + continue; + break; + } + if (filter_res == 0) continue; @@ -195,8 +223,7 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static int cls_bpf_prog_from_ops(struct nlattr **tb, - struct cls_bpf_prog *prog, u32 classid) +static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) { struct sock_filter *bpf_ops; struct sock_fprog_kern fprog_tmp; @@ -230,15 +257,13 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, prog->bpf_ops = bpf_ops; prog->bpf_num_ops = bpf_num_ops; prog->bpf_name = NULL; - prog->filter = fp; - prog->res.classid = classid; return 0; } -static int cls_bpf_prog_from_efd(struct nlattr **tb, - struct cls_bpf_prog *prog, u32 classid) +static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, + const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; @@ -268,9 +293,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, prog->bpf_ops = NULL; prog->bpf_fd = bpf_fd; prog->bpf_name = name; - prog->filter = fp; - prog->res.classid = classid; + + if (fp->dst_needed) + netif_keep_dst(qdisc_dev(tp->q)); return 0; } @@ -280,16 +306,13 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr) { + bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; - bool is_bpf, is_ebpf; - u32 classid; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; is_ebpf = tb[TCA_BPF_FD]; - - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || - !tb[TCA_BPF_CLASSID]) + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); @@ -297,18 +320,32 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if (ret < 0) return ret; - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + if (tb[TCA_BPF_FLAGS]) { + u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); + + if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { + tcf_exts_destroy(&exts); + return -EINVAL; + } + + have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; + } + + prog->exts_integrated = have_exts; - ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : - cls_bpf_prog_from_efd(tb, prog, classid); + ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : + cls_bpf_prog_from_efd(tb, prog, tp); if (ret < 0) { tcf_exts_destroy(&exts); return ret; } - tcf_bind_filter(tp, &prog->res, base); - tcf_exts_change(tp, &prog->exts, &exts); + if (tb[TCA_BPF_CLASSID]) { + prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + tcf_bind_filter(tp, &prog->res, base); + } + tcf_exts_change(tp, &prog->exts, &exts); return 0; } @@ -429,6 +466,7 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; struct nlattr *nest; + u32 bpf_flags = 0; int ret; if (prog == NULL) @@ -440,7 +478,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) + if (prog->res.classid && + nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; if (cls_bpf_is_ebpf(prog)) @@ -453,6 +492,11 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; + if (prog->exts_integrated) + bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; + if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) + goto nla_put_failure; + nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &prog->exts) < 0) diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index bb2a0f529c1f..536838b657bf 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -301,7 +301,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect_flow_keys(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys, 0); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2f3d03f99487..57692947ebbe 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -129,7 +129,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, * so do it rather here. */ skb_key.basic.n_proto = skb->protocol; - skb_flow_dissect(skb, &head->dissector, &skb_key); + skb_flow_dissect(skb, &head->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 715e01e5910a..f23a3b68bba6 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,7 +33,6 @@ struct fw_head { u32 mask; - bool mask_set; struct fw_filter __rcu *ht[HTSIZE]; struct rcu_head rcu; }; @@ -84,7 +83,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, } } } else { - /* old method */ + /* Old method: classify the packet using its skb mark. */ if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id ^ tp->q->handle)))) { res->classid = id; @@ -114,14 +113,9 @@ static unsigned long fw_get(struct tcf_proto *tp, u32 handle) static int fw_init(struct tcf_proto *tp) { - struct fw_head *head; - - head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); - if (head == NULL) - return -ENOBUFS; - - head->mask_set = false; - rcu_assign_pointer(tp->root, head); + /* We don't allocate fw_head here, because in the old method + * we don't need it at all. + */ return 0; } @@ -252,7 +246,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, int err; if (!opt) - return handle ? -EINVAL : 0; + return handle ? -EINVAL : 0; /* Succeed if it is old method. */ err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); if (err < 0) @@ -302,11 +296,17 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (!handle) return -EINVAL; - if (!head->mask_set) { - head->mask = 0xFFFFFFFF; + if (!head) { + u32 mask = 0xFFFFFFFF; if (tb[TCA_FW_MASK]) - head->mask = nla_get_u32(tb[TCA_FW_MASK]); - head->mask_set = true; + mask = nla_get_u32(tb[TCA_FW_MASK]); + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + head->mask = mask; + + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index cab9e9b43967..4fbb67430ce4 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -490,6 +490,19 @@ static bool u32_destroy(struct tcf_proto *tp, bool force) return false; } } + + if (tp_c->refcnt > 1) + return false; + + if (tp_c->refcnt == 1) { + struct tc_u_hnode *ht; + + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) + if (!ht_empty(ht)) + return false; + } } if (root_ht && --root_ht->refcnt == 0) diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index df0328ba6a48..c66ca9400ab4 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -95,6 +95,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, if (skb->skb_iif) indev = dev_get_by_index_rcu(em->net, skb->skb_iif); + acpar.net = em->net; acpar.in = indev ? indev : dev; acpar.out = dev; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f06aa01d60fd..f43c8f33f09e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1806,51 +1806,45 @@ done: * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. */ -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) { __be16 protocol = tc_skb_protocol(skb); - int err; +#ifdef CONFIG_NET_CLS_ACT + const struct tcf_proto *old_tp = tp; + int limit = 0; +reclassify: +#endif for (; tp; tp = rcu_dereference_bh(tp->next)) { + int err; + if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; - err = tp->classify(skb, tp, res); + err = tp->classify(skb, tp, res); +#ifdef CONFIG_NET_CLS_ACT + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) + goto reset; +#endif if (err >= 0) return err; } - return -1; -} -EXPORT_SYMBOL(tc_classify_compat); -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - int err = 0; -#ifdef CONFIG_NET_CLS_ACT - const struct tcf_proto *otp = tp; - int limit = 0; -reclassify: -#endif - - err = tc_classify_compat(skb, tp, res); + return -1; #ifdef CONFIG_NET_CLS_ACT - if (err == TC_ACT_RECLASSIFY) { - tp = otp; - - if (unlikely(limit++ >= MAX_REC_LOOP)) { - net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", - tp->q->ops->id, - tp->prio & 0xffff, - ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - goto reclassify; +reset: + if (unlikely(limit++ >= MAX_REC_LOOP)) { + net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", + tp->q->ops->id, tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; } + + tp = old_tp; + goto reclassify; #endif - return err; } EXPORT_SYMBOL(tc_classify); @@ -1947,6 +1941,7 @@ static int __init pktsched_init(void) register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); register_qdisc(&mq_qdisc_ops); + register_qdisc(&noqueue_qdisc_ops); rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index e3e2cc5fd068..1911af3ca7c0 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -375,7 +375,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 094a874b48bc..3fee70d9814f 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -11,7 +11,7 @@ * Note: Quantum tunneling is not supported. */ -#include <linux/module.h> +#include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> @@ -37,17 +37,8 @@ static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; -static int __init blackhole_module_init(void) +static int __init blackhole_init(void) { return register_qdisc(&blackhole_qdisc_ops); } - -static void __exit blackhole_module_exit(void) -{ - unregister_qdisc(&blackhole_qdisc_ops); -} - -module_init(blackhole_module_init) -module_exit(blackhole_module_exit) - -MODULE_LICENSE("GPL"); +device_initcall(blackhole_init) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index beeb75f80fdb..c538d9e4a8f6 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -240,7 +240,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (!fl || result < 0) goto fallback; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 6a783afe4960..02bfd3d1c4f0 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect_flow_keys(skb1, &temp); + skb_flow_dissect_flow_keys(skb1, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect_flow_keys(skb2, &temp); + skb_flow_dissect_flow_keys(skb2, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } @@ -201,7 +201,7 @@ static bool choke_classify(struct sk_buff *skb, int result; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 338706092c27..f26bdea875c1 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -331,7 +331,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 66700a6116aa..f357f34d02d2 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -35,14 +35,20 @@ #define NO_DEFAULT_INDEX (1 << 16) +struct mask_value { + u8 mask; + u8 value; +}; + struct dsmark_qdisc_data { struct Qdisc *q; struct tcf_proto __rcu *filter_list; - u8 *mask; /* "owns" the array */ - u8 *value; + struct mask_value *mv; u16 indices; + u8 set_tc_index; u32 default_index; /* index range is 0...0xffff */ - int set_tc_index; +#define DSMARK_EMBEDDED_SZ 16 + struct mask_value embedded[DSMARK_EMBEDDED_SZ]; }; static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) @@ -116,7 +122,6 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; - u8 mask = 0; pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", __func__, sch, p, classid, parent, *arg); @@ -133,14 +138,11 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, if (err < 0) goto errout; - if (tb[TCA_DSMARK_MASK]) - mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - if (tb[TCA_DSMARK_VALUE]) - p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); + p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]); if (tb[TCA_DSMARK_MASK]) - p->mask[*arg - 1] = mask; + p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]); err = 0; @@ -155,8 +157,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg) if (!dsmark_valid_index(p, arg)) return -EINVAL; - p->mask[arg - 1] = 0xff; - p->value[arg - 1] = 0; + p->mv[arg - 1].mask = 0xff; + p->mv[arg - 1].value = 0; return 0; } @@ -173,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) return; for (i = 0; i < p->indices; i++) { - if (p->mask[i] == 0xff && !p->value[i]) + if (p->mv[i].mask == 0xff && !p->mv[i].value) goto ignore; if (walker->count >= walker->skip) { if (walker->fn(sch, i + 1, walker) < 0) { @@ -230,7 +232,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tc_classify(skb, fl, &res); + int result = tc_classify(skb, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); @@ -291,12 +293,12 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), p->mask[index], - p->value[index]); + ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, + p->mv[index].value); break; case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], - p->value[index]); + ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask, + p->mv[index].value); break; default: /* @@ -304,7 +306,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) * This way, we can send non-IP traffic through dsmark * and don't need yet another qdisc as a bypass. */ - if (p->mask[index] != 0xff || p->value[index]) + if (p->mv[index].mask != 0xff || p->mv[index].value) pr_warn("%s: unsupported protocol %d\n", __func__, ntohs(tc_skb_protocol(skb))); break; @@ -346,7 +348,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) int err = -EINVAL; u32 default_index = NO_DEFAULT_INDEX; u16 indices; - u8 *mask; + int i; pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); @@ -366,18 +368,18 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_DSMARK_DEFAULT_INDEX]) default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); - mask = kmalloc(indices * 2, GFP_KERNEL); - if (mask == NULL) { + if (indices <= DSMARK_EMBEDDED_SZ) + p->mv = p->embedded; + else + p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL); + if (!p->mv) { err = -ENOMEM; goto errout; } - - p->mask = mask; - memset(p->mask, 0xff, indices); - - p->value = p->mask + indices; - memset(p->value, 0, indices); - + for (i = 0; i < indices; i++) { + p->mv[i].mask = 0xff; + p->mv[i].value = 0; + } p->indices = indices; p->default_index = default_index; p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); @@ -410,7 +412,8 @@ static void dsmark_destroy(struct Qdisc *sch) tcf_destroy_chain(&p->filter_list); qdisc_destroy(p->q); - kfree(p->mask); + if (p->mv != p->embedded) + kfree(p->mv); } static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, @@ -430,8 +433,8 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || - nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) + if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) || + nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value)) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f377702d4b91..109b2322778f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -224,13 +224,16 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) return &q->internal; - /* SYNACK messages are attached to a listener socket. - * 1) They are not part of a 'flow' yet - * 2) We do not want to rate limit them (eg SYNFLOOD attack), + /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket + * or a listener (SYNCOOKIE mode) + * 1) request sockets are not full blown, + * they do not contain sk_pacing_rate + * 2) They are not part of a 'flow' yet + * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE - * 3) We pretend they are orphaned + * 4) We pretend they are orphaned */ - if (!sk || sk->sk_state == TCP_LISTEN) { + if (!sk || sk_listener(sk)) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a9ba030435a2..4c834e93dafb 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -92,7 +92,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, filter, &res); + result = tc_classify(skb, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 942fea8405a4..cb5d4ad32946 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -416,33 +416,25 @@ struct Qdisc noop_qdisc = { }; EXPORT_SYMBOL(noop_qdisc); -static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { +static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt) +{ + /* register_qdisc() assigns a default of noop_enqueue if unset, + * but __dev_queue_xmit() treats noqueue only as such + * if this is NULL - so clear it here. */ + qdisc->enqueue = NULL; + return 0; +} + +struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, + .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; -static struct Qdisc noqueue_qdisc; -static struct netdev_queue noqueue_netdev_queue = { - .qdisc = &noqueue_qdisc, - .qdisc_sleeping = &noqueue_qdisc, -}; - -static struct Qdisc noqueue_qdisc = { - .enqueue = NULL, - .dequeue = noop_dequeue, - .flags = TCQ_F_BUILTIN, - .ops = &noqueue_qdisc_ops, - .list = LIST_HEAD_INIT(noqueue_qdisc.list), - .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), - .dev_queue = &noqueue_netdev_queue, - .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), -}; - - static const u8 prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; @@ -733,18 +725,19 @@ static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { - struct Qdisc *qdisc = &noqueue_qdisc; + struct Qdisc *qdisc; + const struct Qdisc_ops *ops = default_qdisc_ops; - if (dev->tx_queue_len && !(dev->priv_flags & IFF_NO_QUEUE)) { - qdisc = qdisc_create_dflt(dev_queue, - default_qdisc_ops, TC_H_ROOT); - if (!qdisc) { - netdev_info(dev, "activation failed\n"); - return; - } - if (!netif_is_multiqueue(dev)) - qdisc->flags |= TCQ_F_ONETXQUEUE; + if (dev->priv_flags & IFF_NO_QUEUE) + ops = &noqueue_qdisc_ops; + + qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT); + if (!qdisc) { + netdev_info(dev, "activation failed\n"); + return; } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE; dev_queue->qdisc_sleeping = qdisc; } @@ -756,7 +749,6 @@ static void attach_default_qdiscs(struct net_device *dev) txq = netdev_get_tx_queue(dev, 0); if (!netif_is_multiqueue(dev) || - dev->tx_queue_len == 0 || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; @@ -781,7 +773,7 @@ static void transition_one_qdisc(struct net_device *dev, clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); - if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + if (need_watchdog_p) { dev_queue->trans_start = 0; *need_watchdog_p = 1; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index e6c7416d0332..b7ebe2c87586 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1165,7 +1165,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index cf4b0f865d1b..15ccd7f8fb2a 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -229,7 +229,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 42dd218871e0..4e904ca0af9d 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -46,7 +46,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 8e5cd34aaa74..ba6487f2741f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -42,7 +42,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index ffaeea63d473..3dc3a6e56052 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -717,7 +717,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index dcdff5c769a1..5bbb6332ec57 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -258,7 +258,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 52f75a5473e1..3abab534eb5c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -179,7 +179,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 197c3f59ecbf..b00f1f9611d6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1208,20 +1208,22 @@ void sctp_assoc_update(struct sctp_association *asoc, * within this document. * * Our basic strategy is to round-robin transports in priorities - * according to sctp_state_prio_map[] e.g., if no such + * according to sctp_trans_score() e.g., if no such * transport with state SCTP_ACTIVE exists, round-robin through * SCTP_UNKNOWN, etc. You get the picture. */ -static const u8 sctp_trans_state_to_prio_map[] = { - [SCTP_ACTIVE] = 3, /* best case */ - [SCTP_UNKNOWN] = 2, - [SCTP_PF] = 1, - [SCTP_INACTIVE] = 0, /* worst case */ -}; - static u8 sctp_trans_score(const struct sctp_transport *trans) { - return sctp_trans_state_to_prio_map[trans->state]; + switch (trans->state) { + case SCTP_ACTIVE: + return 3; /* best case */ + case SCTP_UNKNOWN: + return 2; + case SCTP_PF: + return 1; + default: /* case SCTP_INACTIVE */ + return 0; /* worst case */ + } } static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4345790ad326..3d9ea9a48289 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -506,14 +506,22 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (IS_ERR(rt)) continue; + if (!dst) + dst = &rt->dst; + /* Ensure the src address belongs to the output * interface. */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); - if (!odev || odev->ifindex != fl4->flowi4_oif) + if (!odev || odev->ifindex != fl4->flowi4_oif) { + if (&rt->dst != dst) + dst_release(&rt->dst); continue; + } + if (dst != &rt->dst) + dst_release(dst); dst = &rt->dst; break; } @@ -1178,7 +1186,7 @@ static void sctp_v4_del_protocol(void) unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } -static int __net_init sctp_net_init(struct net *net) +static int __net_init sctp_defaults_init(struct net *net) { int status; @@ -1271,12 +1279,6 @@ static int __net_init sctp_net_init(struct net *net) sctp_dbg_objcnt_init(net); - /* Initialize the control inode/socket for handling OOTB packets. */ - if ((status = sctp_ctl_sock_init(net))) { - pr_err("Failed to initialize the SCTP control sock\n"); - goto err_ctl_sock_init; - } - /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); @@ -1292,9 +1294,6 @@ static int __net_init sctp_net_init(struct net *net) return 0; -err_ctl_sock_init: - sctp_dbg_objcnt_exit(net); - sctp_proc_exit(net); err_init_proc: cleanup_sctp_mibs(net); err_init_mibs: @@ -1303,15 +1302,12 @@ err_sysctl_register: return status; } -static void __net_exit sctp_net_exit(struct net *net) +static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); - /* Free the control endpoint. */ - inet_ctl_sock_destroy(net->sctp.ctl_sock); - sctp_dbg_objcnt_exit(net); sctp_proc_exit(net); @@ -1319,9 +1315,32 @@ static void __net_exit sctp_net_exit(struct net *net) sctp_sysctl_net_unregister(net); } -static struct pernet_operations sctp_net_ops = { - .init = sctp_net_init, - .exit = sctp_net_exit, +static struct pernet_operations sctp_defaults_ops = { + .init = sctp_defaults_init, + .exit = sctp_defaults_exit, +}; + +static int __net_init sctp_ctrlsock_init(struct net *net) +{ + int status; + + /* Initialize the control inode/socket for handling OOTB packets. */ + status = sctp_ctl_sock_init(net); + if (status) + pr_err("Failed to initialize the SCTP control sock\n"); + + return status; +} + +static void __net_init sctp_ctrlsock_exit(struct net *net) +{ + /* Free the control endpoint. */ + inet_ctl_sock_destroy(net->sctp.ctl_sock); +} + +static struct pernet_operations sctp_ctrlsock_ops = { + .init = sctp_ctrlsock_init, + .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ @@ -1454,8 +1473,11 @@ static __init int sctp_init(void) sctp_v4_pf_init(); sctp_v6_pf_init(); - status = sctp_v4_protosw_init(); + status = register_pernet_subsys(&sctp_defaults_ops); + if (status) + goto err_register_defaults; + status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; @@ -1463,9 +1485,9 @@ static __init int sctp_init(void) if (status) goto err_v6_protosw_init; - status = register_pernet_subsys(&sctp_net_ops); + status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) - goto err_register_pernet_subsys; + goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) @@ -1481,12 +1503,14 @@ out: err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: - unregister_pernet_subsys(&sctp_net_ops); -err_register_pernet_subsys: + unregister_pernet_subsys(&sctp_ctrlsock_ops); +err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: + unregister_pernet_subsys(&sctp_defaults_ops); +err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); @@ -1519,12 +1543,14 @@ static __exit void sctp_exit(void) sctp_v6_del_protocol(); sctp_v4_del_protocol(); - unregister_pernet_subsys(&sctp_net_ops); + unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); + unregister_pernet_subsys(&sctp_defaults_ops); + /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 06320c8c1c86..763e06a55155 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2494,7 +2494,7 @@ static int sctp_process_param(struct sctp_association *asoc, __u16 sat; int retval = 1; sctp_scope_t scope; - time_t stale; + u32 stale; struct sctp_af *af; union sctp_addr_param *addr_param; struct sctp_transport *t; @@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); - } else - sctp_assoc_del_peer(asoc, &addr); + return SCTP_ERROR_NO_ERROR; + } + + /* If the address is not part of the association, the + * ASCONF-ACK with Error Cause Indication Parameter + * which including cause of Unresolvable Address should + * be sent. + */ + peer = sctp_assoc_lookup_paddr(asoc, &addr); + if (!peer) + return SCTP_ERROR_DNS_FAILED; + + sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 @@ -3132,11 +3143,18 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, case SCTP_PARAM_IPV4_ADDRESS: if (length != sizeof(sctp_ipv4addr_param_t)) return false; + /* ensure there is only one addr param and it's in the + * beginning of addip_hdr params, or we reject it. + */ + if (param.v != addip->addip_hdr.params) + return false; addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: if (length != sizeof(sctp_ipv6addr_param_t)) return false; + if (param.v != addip->addip_hdr.params) + return false; addr_param_seen = true; break; case SCTP_PARAM_ADD_IP: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index fef2acdf4a2e..6098d4c42fa9 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -244,12 +244,13 @@ void sctp_generate_t3_rtx_event(unsigned long peer) int error; struct sctp_transport *transport = (struct sctp_transport *) peer; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); /* Check whether a task is in the sock. */ - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ @@ -272,10 +273,10 @@ void sctp_generate_t3_rtx_event(unsigned long peer) transport, GFP_ATOMIC); if (error) - asoc->base.sk->sk_err = -error; + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_transport_put(transport); } @@ -285,11 +286,12 @@ out_unlock: static void sctp_generate_timeout_event(struct sctp_association *asoc, sctp_event_timeout_t timeout_type) { - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); int error = 0; - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy: timer %d\n", __func__, timeout_type); @@ -312,10 +314,10 @@ static void sctp_generate_timeout_event(struct sctp_association *asoc, (void *)timeout_type, GFP_ATOMIC); if (error) - asoc->base.sk->sk_err = -error; + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_association_put(asoc); } @@ -365,10 +367,11 @@ void sctp_generate_heartbeat_event(unsigned long data) int error = 0; struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ @@ -388,11 +391,11 @@ void sctp_generate_heartbeat_event(unsigned long data) asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); - if (error) - asoc->base.sk->sk_err = -error; + if (error) + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_transport_put(transport); } @@ -403,10 +406,11 @@ void sctp_generate_proto_unreach_event(unsigned long data) { struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ @@ -427,7 +431,7 @@ void sctp_generate_proto_unreach_event(unsigned long data) asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_association_put(asoc); } @@ -702,7 +706,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, * outstanding data and rely on the retransmission limit be reached * to shutdown the association. */ - if (t->asoc->state != SCTP_STATE_SHUTDOWN_PENDING) + if (t->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) t->asoc->overall_error_count = 0; /* Clear the hb_sent flag to signal that we had a good @@ -954,7 +958,7 @@ static void sctp_cmd_del_non_primary(struct sctp_association *asoc) t = list_entry(pos, struct sctp_transport, transports); if (!sctp_cmp_addr_exact(&t->ipaddr, &asoc->peer.primary_addr)) { - sctp_assoc_del_peer(asoc, &t->ipaddr); + sctp_assoc_rm_peer(asoc, t); } } } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d7eaa7354cf7..6f46aa16cb76 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2306,7 +2306,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - time_t stale; + u32 stale; sctp_cookie_preserve_param_t bht; sctp_errhdr_t *err; struct sctp_chunk *reply; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4feda2d0a833..548240dd15fc 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -23,7 +23,7 @@ struct unx_cred { }; #define uc_uid uc_base.cr_uid -#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) +#define UNX_WRITESLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 2928afffbb81..4a2340a54401 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -44,7 +44,7 @@ static void cache_revisit_request(struct cache_head *item); static void cache_init(struct cache_head *h) { time_t now = seconds_since_boot(); - h->next = NULL; + INIT_HLIST_NODE(&h->cache_list); h->flags = 0; kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; @@ -54,15 +54,14 @@ static void cache_init(struct cache_head *h) struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, struct cache_head *key, int hash) { - struct cache_head **head, **hp; - struct cache_head *new = NULL, *freeme = NULL; + struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL; + struct hlist_head *head; head = &detail->hash_table[hash]; read_lock(&detail->hash_lock); - for (hp=head; *hp != NULL ; hp = &(*hp)->next) { - struct cache_head *tmp = *hp; + hlist_for_each_entry(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) /* This entry is expired, we will discard it. */ @@ -88,12 +87,10 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, write_lock(&detail->hash_lock); /* check if entry appeared while we slept */ - for (hp=head; *hp != NULL ; hp = &(*hp)->next) { - struct cache_head *tmp = *hp; + hlist_for_each_entry(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) { - *hp = tmp->next; - tmp->next = NULL; + hlist_del_init(&tmp->cache_list); detail->entries --; freeme = tmp; break; @@ -104,8 +101,8 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, return tmp; } } - new->next = *head; - *head = new; + + hlist_add_head(&new->cache_list, head); detail->entries++; cache_get(new); write_unlock(&detail->hash_lock); @@ -143,7 +140,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, * If 'old' is not VALID, we update it directly, * otherwise we need to replace it */ - struct cache_head **head; struct cache_head *tmp; if (!test_bit(CACHE_VALID, &old->flags)) { @@ -168,15 +164,13 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, } cache_init(tmp); detail->init(tmp, old); - head = &detail->hash_table[hash]; write_lock(&detail->hash_lock); if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &tmp->flags); else detail->update(tmp, new); - tmp->next = *head; - *head = tmp; + hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); cache_fresh_locked(tmp, new->expiry_time); @@ -416,28 +410,29 @@ static int cache_clean(void) /* find a non-empty bucket in the table */ while (current_detail && current_index < current_detail->hash_size && - current_detail->hash_table[current_index] == NULL) + hlist_empty(¤t_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ if (current_detail && current_index < current_detail->hash_size) { - struct cache_head *ch, **cp; + struct cache_head *ch = NULL; struct cache_detail *d; + struct hlist_head *head; + struct hlist_node *tmp; write_lock(¤t_detail->hash_lock); /* Ok, now to clean this strand */ - cp = & current_detail->hash_table[current_index]; - for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) { + head = ¤t_detail->hash_table[current_index]; + hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) current_detail->nextcheck = ch->expiry_time+1; if (!cache_is_expired(current_detail, ch)) continue; - *cp = ch->next; - ch->next = NULL; + hlist_del_init(&ch->cache_list); current_detail->entries--; rv = 1; break; @@ -1270,18 +1265,13 @@ EXPORT_SYMBOL_GPL(qword_get); * get a header, then pass each real item in the cache */ -struct handle { - struct cache_detail *cd; -}; - -static void *c_start(struct seq_file *m, loff_t *pos) +void *cache_seq_start(struct seq_file *m, loff_t *pos) __acquires(cd->hash_lock) { loff_t n = *pos; unsigned int hash, entry; struct cache_head *ch; - struct cache_detail *cd = ((struct handle*)m->private)->cd; - + struct cache_detail *cd = m->private; read_lock(&cd->hash_lock); if (!n--) @@ -1289,7 +1279,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) hash = n >> 32; entry = n & ((1LL<<32) - 1); - for (ch=cd->hash_table[hash]; ch; ch=ch->next) + hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; n &= ~((1LL<<32) - 1); @@ -1297,51 +1287,57 @@ static void *c_start(struct seq_file *m, loff_t *pos) hash++; n += 1LL<<32; } while(hash < cd->hash_size && - cd->hash_table[hash]==NULL); + hlist_empty(&cd->hash_table[hash])); if (hash >= cd->hash_size) return NULL; *pos = n+1; - return cd->hash_table[hash]; + return hlist_entry_safe(cd->hash_table[hash].first, + struct cache_head, cache_list); } +EXPORT_SYMBOL_GPL(cache_seq_start); -static void *c_next(struct seq_file *m, void *p, loff_t *pos) +void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) { struct cache_head *ch = p; int hash = (*pos >> 32); - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) hash = 0; - else if (ch->next == NULL) { + else if (ch->cache_list.next == NULL) { hash++; *pos += 1LL<<32; } else { ++*pos; - return ch->next; + return hlist_entry_safe(ch->cache_list.next, + struct cache_head, cache_list); } *pos &= ~((1LL<<32) - 1); while (hash < cd->hash_size && - cd->hash_table[hash] == NULL) { + hlist_empty(&cd->hash_table[hash])) { hash++; *pos += 1LL<<32; } if (hash >= cd->hash_size) return NULL; ++*pos; - return cd->hash_table[hash]; + return hlist_entry_safe(cd->hash_table[hash].first, + struct cache_head, cache_list); } +EXPORT_SYMBOL_GPL(cache_seq_next); -static void c_stop(struct seq_file *m, void *p) +void cache_seq_stop(struct seq_file *m, void *p) __releases(cd->hash_lock) { - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; read_unlock(&cd->hash_lock); } +EXPORT_SYMBOL_GPL(cache_seq_stop); static int c_show(struct seq_file *m, void *p) { struct cache_head *cp = p; - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) return cd->cache_show(m, cd, NULL); @@ -1364,33 +1360,36 @@ static int c_show(struct seq_file *m, void *p) } static const struct seq_operations cache_content_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, + .start = cache_seq_start, + .next = cache_seq_next, + .stop = cache_seq_stop, .show = c_show, }; static int content_open(struct inode *inode, struct file *file, struct cache_detail *cd) { - struct handle *han; + struct seq_file *seq; + int err; if (!cd || !try_module_get(cd->owner)) return -EACCES; - han = __seq_open_private(file, &cache_content_op, sizeof(*han)); - if (han == NULL) { + + err = seq_open(file, &cache_content_op); + if (err) { module_put(cd->owner); - return -ENOMEM; + return err; } - han->cd = cd; + seq = file->private_data; + seq->private = cd; return 0; } static int content_release(struct inode *inode, struct file *file, struct cache_detail *cd) { - int ret = seq_release_private(inode, file); + int ret = seq_release(inode, file); module_put(cd->owner); return ret; } @@ -1665,17 +1664,21 @@ EXPORT_SYMBOL_GPL(cache_unregister_net); struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net) { struct cache_detail *cd; + int i; cd = kmemdup(tmpl, sizeof(struct cache_detail), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); - cd->hash_table = kzalloc(cd->hash_size * sizeof(struct cache_head *), + cd->hash_table = kzalloc(cd->hash_size * sizeof(struct hlist_head), GFP_KERNEL); if (cd->hash_table == NULL) { kfree(cd); return ERR_PTR(-ENOMEM); } + + for (i = 0; i < cd->hash_size; i++) + INIT_HLIST_HEAD(&cd->hash_table[i]); cd->net = net; return cd; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 337ca851a350..f14f24ee9983 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1092,14 +1092,10 @@ void rpc_destroy_mempool(void) { rpciod_stop(); - if (rpc_buffer_mempool) - mempool_destroy(rpc_buffer_mempool); - if (rpc_task_mempool) - mempool_destroy(rpc_task_mempool); - if (rpc_task_slabp) - kmem_cache_destroy(rpc_task_slabp); - if (rpc_buffer_slabp) - kmem_cache_destroy(rpc_buffer_slabp); + mempool_destroy(rpc_buffer_mempool); + mempool_destroy(rpc_task_mempool); + kmem_cache_destroy(rpc_task_slabp); + kmem_cache_destroy(rpc_buffer_slabp); rpc_destroy_wait_queue(&delay_queue); } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5a16d8d8c831..a8f579df14d8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -34,36 +34,19 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net); -#define svc_serv_is_pooled(serv) ((serv)->sv_function) +#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function) -/* - * Mode for mapping cpus to pools. - */ -enum { - SVC_POOL_AUTO = -1, /* choose one of the others */ - SVC_POOL_GLOBAL, /* no mapping, just a single global pool - * (legacy & UP mode) */ - SVC_POOL_PERCPU, /* one pool per cpu */ - SVC_POOL_PERNODE /* one pool per numa node */ -}; #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL /* * Structure for mapping cpus to pools and vice versa. * Setup once during sunrpc initialisation. */ -static struct svc_pool_map { - int count; /* How many svc_servs use us */ - int mode; /* Note: int not enum to avoid - * warnings about "enumeration value - * not handled in switch" */ - unsigned int npools; - unsigned int *pool_to; /* maps pool id to cpu or node */ - unsigned int *to_pool; /* maps cpu or node to pool id */ -} svc_pool_map = { - .count = 0, +struct svc_pool_map svc_pool_map = { .mode = SVC_POOL_DEFAULT }; +EXPORT_SYMBOL_GPL(svc_pool_map); + static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int @@ -236,7 +219,7 @@ svc_pool_map_init_pernode(struct svc_pool_map *m) * vice versa). Initialise the map if we're the first user. * Returns the number of pools. */ -static unsigned int +unsigned int svc_pool_map_get(void) { struct svc_pool_map *m = &svc_pool_map; @@ -271,7 +254,7 @@ svc_pool_map_get(void) mutex_unlock(&svc_pool_map_mutex); return m->npools; } - +EXPORT_SYMBOL_GPL(svc_pool_map_get); /* * Drop a reference to the global map of cpus to pools. @@ -280,7 +263,7 @@ svc_pool_map_get(void) * mode using the pool_mode module option without * rebooting or re-loading sunrpc.ko. */ -static void +void svc_pool_map_put(void) { struct svc_pool_map *m = &svc_pool_map; @@ -297,7 +280,7 @@ svc_pool_map_put(void) mutex_unlock(&svc_pool_map_mutex); } - +EXPORT_SYMBOL_GPL(svc_pool_map_put); static int svc_pool_map_get_node(unsigned int pidx) { @@ -423,7 +406,7 @@ EXPORT_SYMBOL_GPL(svc_bind); */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv, struct net *net)) + struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int vers; @@ -440,7 +423,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); - serv->sv_shutdown = shutdown; + serv->sv_ops = ops; xdrsize = 0; while (prog) { prog->pg_lovers = prog->pg_nvers-1; @@ -486,26 +469,22 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv, struct net *net)) + struct svc_serv_ops *ops) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, ops); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv, struct net *net), - svc_thread_fn func, struct module *mod) + struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, ops); if (!serv) goto out_err; - - serv->sv_function = func; - serv->sv_module = mod; return serv; out_err: svc_pool_map_put(); @@ -517,8 +496,8 @@ void svc_shutdown_net(struct svc_serv *serv, struct net *net) { svc_close_net(serv, net); - if (serv->sv_shutdown) - serv->sv_shutdown(serv, net); + if (serv->sv_ops->svo_shutdown) + serv->sv_ops->svo_shutdown(serv, net); } EXPORT_SYMBOL_GPL(svc_shutdown_net); @@ -604,40 +583,52 @@ svc_release_buffer(struct svc_rqst *rqstp) } struct svc_rqst * -svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) +svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node); if (!rqstp) - goto out_enomem; + return rqstp; - serv->sv_nrthreads++; __set_bit(RQ_BUSY, &rqstp->rq_flags); spin_lock_init(&rqstp->rq_lock); rqstp->rq_server = serv; rqstp->rq_pool = pool; - spin_lock_bh(&pool->sp_lock); - pool->sp_nrthreads++; - list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); - spin_unlock_bh(&pool->sp_lock); rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) - goto out_thread; + goto out_enomem; rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_resp) - goto out_thread; + goto out_enomem; if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) - goto out_thread; + goto out_enomem; return rqstp; -out_thread: - svc_exit_thread(rqstp); out_enomem: - return ERR_PTR(-ENOMEM); + svc_rqst_free(rqstp); + return NULL; +} +EXPORT_SYMBOL_GPL(svc_rqst_alloc); + +struct svc_rqst * +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) +{ + struct svc_rqst *rqstp; + + rqstp = svc_rqst_alloc(serv, pool, node); + if (!rqstp) + return ERR_PTR(-ENOMEM); + + serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); + return rqstp; } EXPORT_SYMBOL_GPL(svc_prepare_thread); @@ -739,12 +730,12 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) break; } - __module_get(serv->sv_module); - task = kthread_create_on_node(serv->sv_function, rqstp, + __module_get(serv->sv_ops->svo_module); + task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { error = PTR_ERR(task); - module_put(serv->sv_module); + module_put(serv->sv_ops->svo_module); svc_exit_thread(rqstp); break; } @@ -772,15 +763,21 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); * mutex" for the service. */ void -svc_exit_thread(struct svc_rqst *rqstp) +svc_rqst_free(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - svc_release_buffer(rqstp); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); + kfree_rcu(rqstp, rq_rcu_head); +} +EXPORT_SYMBOL_GPL(svc_rqst_free); + +void +svc_exit_thread(struct svc_rqst *rqstp) +{ + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads--; @@ -788,7 +785,7 @@ svc_exit_thread(struct svc_rqst *rqstp) list_del_rcu(&rqstp->rq_all); spin_unlock_bh(&pool->sp_lock); - kfree_rcu(rqstp, rq_rcu_head); + svc_rqst_free(rqstp); /* Release the server */ if (serv) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 163ac45c3639..a6cbb2104667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -24,7 +24,6 @@ static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); static void svc_age_temp_xprts(unsigned long closure); static void svc_delete_xprt(struct svc_xprt *xprt); -static void svc_xprt_do_enqueue(struct svc_xprt *xprt); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after @@ -225,12 +224,12 @@ static void svc_xprt_received(struct svc_xprt *xprt) } /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_xprt_do_enqueue with: + * 'put', so we need a reference to call svc_enqueue_xprt with: */ svc_xprt_get(xprt); smp_mb__before_atomic(); clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_do_enqueue(xprt); + xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); svc_xprt_put(xprt); } @@ -320,7 +319,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) return false; } -static void svc_xprt_do_enqueue(struct svc_xprt *xprt) +void svc_xprt_do_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; struct svc_rqst *rqstp = NULL; @@ -402,6 +401,7 @@ redo_search: out: trace_svc_xprt_do_enqueue(xprt, rqstp); } +EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); /* * Queue up a transport with data pending. If there are idle nfsd @@ -412,7 +412,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) { if (test_bit(XPT_BUSY, &xprt->xpt_flags)) return; - svc_xprt_do_enqueue(xprt); + xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ab5dd621ae0c..2e98f4a243e5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -614,6 +614,7 @@ static void xprt_autoclose(struct work_struct *work) clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); + wake_up_bit(&xprt->state, XPRT_LOCKED); } /** @@ -723,6 +724,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) xprt->ops->release_xprt(xprt, NULL); out: spin_unlock_bh(&xprt->transport_lock); + wake_up_bit(&xprt->state, XPRT_LOCKED); } /** @@ -1394,6 +1396,10 @@ out: static void xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); + + /* Exclude transport connect/disconnect handlers */ + wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE); + del_timer_sync(&xprt->timer); rpc_xprt_debugfs_unregister(xprt); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 04ea914201b2..5318951b3b53 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -117,7 +117,7 @@ __frwr_recovery_worker(struct work_struct *work) if (ib_dereg_mr(r->r.frmr.fr_mr)) goto out_fail; - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth); + r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); if (IS_ERR(r->r.frmr.fr_mr)) goto out_fail; @@ -148,7 +148,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, struct rpcrdma_frmr *f = &r->r.frmr; int rc; - f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); if (IS_ERR(f->fr_mr)) goto out_mr_err; f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); @@ -158,7 +158,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, out_mr_err: rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + dprintk("RPC: %s: ib_alloc_mr status %i\n", __func__, rc); return rc; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 41985d07fdb7..617b76f22154 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -23,6 +23,21 @@ static int physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + struct ib_mr *mr; + + /* Obtain an rkey to use for RPC data payloads. + */ + mr = ib_get_dma_mr(ia->ri_pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(mr)) { + pr_err("%s: ib_get_dma_mr for failed with %lX\n", + __func__, PTR_ERR(mr)); + return -ENOMEM; + } + + ia->ri_dma_mr = mr; return 0; } @@ -51,7 +66,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_ia *ia = &r_xprt->rx_ia; rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); - seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_rkey = ia->ri_dma_mr->rkey; seg->mr_base = seg->mr_dma; seg->mr_nsegs = 1; return 1; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 84ea37daef36..bc8bd6577467 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,6 +71,67 @@ static const char transfertypes[][12] = { }; #endif +/* The client can send a request inline as long as the RPCRDMA header + * plus the RPC call fit under the transport's inline limit. If the + * combined call message size exceeds that limit, the client must use + * the read chunk list for this operation. + */ +static bool rpcrdma_args_inline(struct rpc_rqst *rqst) +{ + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len; + + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); +} + +/* The client can't know how large the actual reply will be. Thus it + * plans for the largest possible reply for that particular ULP + * operation. If the maximum combined reply message size exceeds that + * limit, the client must provide a write list or a reply chunk for + * this request. + */ +static bool rpcrdma_results_inline(struct rpc_rqst *rqst) +{ + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen; + + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst); +} + +static int +rpcrdma_tail_pullup(struct xdr_buf *buf) +{ + size_t tlen = buf->tail[0].iov_len; + size_t skip = tlen & 3; + + /* Do not include the tail if it is only an XDR pad */ + if (tlen < 4) + return 0; + + /* xdr_write_pages() adds a pad at the beginning of the tail + * if the content in "buf->pages" is unaligned. Force the + * tail's actual content to land at the next XDR position + * after the head instead. + */ + if (skip) { + unsigned char *src, *dst; + unsigned int count; + + src = buf->tail[0].iov_base; + dst = buf->head[0].iov_base; + dst += buf->head[0].iov_len; + + src += skip; + tlen -= skip; + + dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n", + __func__, skip, dst, src, tlen); + + for (count = tlen; count; count--) + *dst++ = *src++; + } + + return tlen; +} + /* * Chunk assembly from upper layer xdr_buf. * @@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (len && n == nsegs) return -EIO; + /* When encoding the read list, the tail is always sent inline */ + if (type == rpcrdma_readch) + return n; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ @@ -297,8 +362,7 @@ out: * pre-registered memory buffer for this request. For small amounts * of data, this is efficient. The cutoff value is tunable. */ -static int -rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) +static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) { int i, npages, curlen; int copy_len; @@ -310,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp = rqst->rq_svec[0].iov_base; curlen = rqst->rq_svec[0].iov_len; destp += curlen; - /* - * Do optional padding where it makes sense. Alignment of write - * payload can help the server, if our setting is accurate. - */ - pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); - if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) - pad = 0; /* don't pad this request */ - dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", - __func__, pad, destp, rqst->rq_slen, curlen); + dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n", + __func__, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; @@ -355,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) page_base = 0; } /* header now contains entire send message */ - return pad; } /* @@ -380,7 +436,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; - size_t rpclen, padlen; + size_t rpclen; ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; @@ -402,28 +458,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* * Chunks needed for results? * + * o Read ops return data as write chunk(s), header as inline. * o If the expected result is under the inline threshold, all ops - * return as inline (but see later). + * return as inline. * o Large non-read ops return as a single reply chunk. - * o Large read ops return data as write chunk(s), header as inline. - * - * Note: the NFS code sending down multiple result segments implies - * the op is one of read, readdir[plus], readlink or NFSv4 getacl. - */ - - /* - * This code can handle read chunks, write chunks OR reply - * chunks -- only one type. If the request is too big to fit - * inline, then we will choose read chunks. If the request is - * a READ, then use write chunks to separate the file data - * into pages; otherwise use reply chunks. */ - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + if (rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; + else if (rpcrdma_results_inline(rqst)) + wtype = rpcrdma_noch; else wtype = rpcrdma_replych; @@ -432,21 +475,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * * o If the total request is under the inline threshold, all ops * are sent as inline. - * o Large non-write ops are sent with the entire message as a - * single read chunk (protocol 0-position special case). * o Large write ops transmit data as read chunk(s), header as * inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). * - * Note: the NFS code sending down multiple argument segments - * implies the op is a write. - * TBD check NFSv4 setacl + * This assumes that the upper layer does not present a request + * that both has a data payload, and whose non-data arguments + * by themselves are larger than the inline threshold. */ - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + if (rpcrdma_args_inline(rqst)) { rtype = rpcrdma_noch; - else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; - else + } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; + } else { + r_xprt->rx_stats.nomsg_call_count++; + headerp->rm_type = htonl(RDMA_NOMSG); + rtype = rpcrdma_areadch; + rpclen = 0; + } /* The following simplification is not true forever */ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) @@ -458,7 +505,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) } hdrlen = RPCRDMA_HDRLEN_MIN; - padlen = 0; /* * Pull up any extra send data into the preregistered buffer. @@ -467,45 +513,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rtype == rpcrdma_noch) { - padlen = rpcrdma_inline_pullup(rqst, - RPCRDMA_INLINE_PAD_VALUE(rqst)); - - if (padlen) { - headerp->rm_type = rdma_msgp; - headerp->rm_body.rm_padded.rm_align = - cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst)); - headerp->rm_body.rm_padded.rm_thresh = - cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH); - headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; - hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { - dprintk("RPC: %s: invalid chunk list\n", - __func__); - return -EIO; - } - } else { - headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; - /* new length after pullup */ - rpclen = rqst->rq_svec[0].iov_len; - /* - * Currently we try to not actually use read inline. - * Reply chunks have the desirable property that - * they land, packed, directly in the target buffers - * without headers, so they require no fixup. The - * additional RDMA Write op sends the same amount - * of data, streams on-the-wire and adds no overhead - * on receive. Therefore, we request a reply chunk - * for non-writes wherever feasible and efficient. - */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; - } - } + rpcrdma_inline_pullup(rqst); + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; + /* new length after pullup */ + rpclen = rqst->rq_svec[0].iov_len; + } else if (rtype == rpcrdma_readch) + rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); if (rtype != rpcrdma_noch) { hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, headerp, rtype); @@ -518,9 +534,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen < 0) return hdrlen; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* @@ -534,26 +550,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + req->rl_niovs = 1; + if (rtype == rpcrdma_areadch) + return 0; + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); req->rl_niovs = 2; - - if (padlen) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf); - req->rl_send_iov[2].length = padlen; - req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf); - - req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; - req->rl_send_iov[3].length = rqst->rq_slen - rpclen; - req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 4; - } - return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2e1348bde325..cb5174284074 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) -{ - if (!rdma_cap_read_multi_sge(xprt->sc_cm_id->device, - xprt->sc_cm_id->port_num)) - return 1; - else - return min_t(int, sge_count, xprt->sc_max_sge); -} - /* Issue an RDMA_READ using the local lkey to map the data sink */ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -144,8 +135,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ctxt->direction = DMA_FROM_DEVICE; ctxt->read_hdr = head; - pages_needed = - min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); + pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd); read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); for (pno = 0; pno < pages_needed; pno++) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d25cd430f9ff..1dfae8317065 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -136,6 +136,79 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, return dma_addr; } +/* Returns the address of the first read chunk or <nul> if no read chunk + * is present + */ +struct rpcrdma_read_chunk * +svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *ch = + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + + if (ch->rc_discrim == xdr_zero) + return NULL; + return ch; +} + +/* Returns the address of the first read write array element or <nul> + * if no write array list is present + */ +static struct rpcrdma_write_array * +svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) +{ + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || + rmsgp->rm_body.rm_chunks[1] == xdr_zero) + return NULL; + return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; +} + +/* Returns the address of the first reply array element or <nul> if no + * reply array is present + */ +static struct rpcrdma_write_array * +svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *rch; + struct rpcrdma_write_array *wr_ary; + struct rpcrdma_write_array *rp_ary; + + /* XXX: Need to fix when reply chunk may occur with read list + * and/or write list. + */ + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || + rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return NULL; + + rch = svc_rdma_get_read_chunk(rmsgp); + if (rch) { + while (rch->rc_discrim != xdr_zero) + rch++; + + /* The reply chunk follows an empty write array located + * at 'rc_position' here. The reply array is at rc_target. + */ + rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; + goto found_it; + } + + wr_ary = svc_rdma_get_write_array(rmsgp); + if (wr_ary) { + int chunk = be32_to_cpu(wr_ary->wc_nchunks); + + rp_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[chunk].wc_target.rs_length; + goto found_it; + } + + /* No read list, no write list */ + rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2]; + + found_it: + if (rp_ary->wc_discrim == xdr_zero) + return NULL; + return rp_ary; +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -384,6 +457,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; + u32 xdr_off; int sge_no; int sge_bytes; int page_no; @@ -418,8 +492,8 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->direction = DMA_TO_DEVICE; /* Map the payload indicated by 'byte_count' */ + xdr_off = 0; for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; ctxt->sge[sge_no].addr = @@ -457,6 +531,13 @@ static int send_reply(struct svcxprt_rdma *rdma, } rqstp->rq_next_page = rqstp->rq_respages + 1; + /* The loop above bumps sc_dma_used for each sge. The + * xdr_buf.tail gets a separate sge, but resides in the + * same page as xdr_buf.head. Don't count it twice. + */ + if (sge_no > ctxt->count) + atomic_dec(&rdma->sc_dma_used); + if (sge_no > rdma->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6b36279e4288..fcc3eb80c265 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -91,7 +91,7 @@ struct svc_xprt_class svc_rdma_class = { .xcl_name = "rdma", .xcl_owner = THIS_MODULE, .xcl_ops = &svc_rdma_ops, - .xcl_max_payload = RPCRDMA_MAXPAYLOAD, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA, .xcl_ident = XPRT_TRANSPORT_RDMA, }; @@ -659,6 +659,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, if (xprt) { set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } break; default: @@ -733,17 +734,19 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) struct ib_mr *mr; struct ib_fast_reg_page_list *pl; struct svc_rdma_fastreg_mr *frmr; + u32 num_sg; frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); if (!frmr) goto err; - mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len); + mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg); if (IS_ERR(mr)) goto err_free_frmr; pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, - RPCSVC_MAXPAGES); + num_sg); if (IS_ERR(pl)) goto err_free_mr; @@ -872,6 +875,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * capabilities of this particular device */ newxprt->sc_max_sge = min((size_t)devattr.max_sge, (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + RPCSVC_MAXPAGES); newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, (size_t)svcrdma_max_requests); newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; @@ -1046,6 +1051,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " remote_ip : %pI4\n" " remote_port : %d\n" " max_sge : %d\n" + " max_sge_rd : %d\n" " sq_depth : %d\n" " max_requests : %d\n" " ord : %d\n", @@ -1059,6 +1065,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> route.addr.dst_addr)->sin_port), newxprt->sc_max_sge, + newxprt->sc_max_sge_rd, newxprt->sc_sq_depth, newxprt->sc_max_requests, newxprt->sc_ord); @@ -1201,40 +1208,6 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp) return 1; } -/* - * Attempt to register the kvec representing the RPC memory with the - * device. - * - * Returns: - * NULL : The device does not support fastreg or there were no more - * fastreg mr. - * frmr : The kvec register request was successfully posted. - * <0 : An error was encountered attempting to register the kvec. - */ -int svc_rdma_fastreg(struct svcxprt_rdma *xprt, - struct svc_rdma_fastreg_mr *frmr) -{ - struct ib_send_wr fastreg_wr; - u8 key; - - /* Bump the key */ - key = (u8)(frmr->mr->lkey & 0x000000FF); - ib_update_fast_reg_key(frmr->mr, ++key); - - /* Prepare FASTREG WR */ - memset(&fastreg_wr, 0, sizeof fastreg_wr); - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.send_flags = IB_SEND_SIGNALED; - fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; - fastreg_wr.wr.fast_reg.page_list = frmr->page_list; - fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = frmr->map_len; - fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; - fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; - return svc_rdma_send(xprt, &fastreg_wr); -} - int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { struct ib_send_wr *bad_wr, *n_wr; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 680f888a9ddd..64443eb754ad 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -175,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) } static void -xprt_rdma_format_addresses(struct rpc_xprt *xprt) +xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { - struct sockaddr *sap = (struct sockaddr *) - &rpcx_to_rdmad(xprt).addr; char buf[128]; switch (sap->sa_family) { @@ -302,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args) struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; struct rpcrdma_ep *new_ep; - struct sockaddr_in *sin; + struct sockaddr *sap; int rc; if (args->addrlen > sizeof(xprt->addr)) { @@ -333,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args) * Set up RDMA-specific connect data. */ - /* Put server RDMA address in local cdata */ - memcpy(&cdata.addr, args->dstaddr, args->addrlen); + sap = (struct sockaddr *)&cdata.addr; + memcpy(sap, args->dstaddr, args->addrlen); /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; xprt->addrlen = args->addrlen; - memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); + memcpy(&xprt->addr, sap, xprt->addrlen); - sin = (struct sockaddr_in *)&cdata.addr; - if (ntohs(sin->sin_port) != 0) + if (rpc_get_port(sap)) xprt_set_bound(xprt); - dprintk("RPC: %s: %pI4:%u\n", - __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port)); - - /* Set max requests */ cdata.max_requests = xprt->max_reqs; - /* Set some length limits */ cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ @@ -375,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, - xprt_rdma_memreg_strategy); + rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy); if (rc) goto out1; @@ -409,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt_rdma_format_addresses(xprt); + xprt_rdma_format_addresses(xprt, sap); xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; @@ -420,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args) if (!try_module_get(THIS_MODULE)) goto out4; + dprintk("RPC: %s: %s:%s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); return xprt; out4: @@ -653,31 +647,30 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; - seq_printf(seq, - "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " - "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", - - 0, /* need a local port? */ - xprt->stat.bind_count, - xprt->stat.connect_count, - xprt->stat.connect_time, - idle_time, - xprt->stat.sends, - xprt->stat.recvs, - xprt->stat.bad_xids, - xprt->stat.req_u, - xprt->stat.bklog_u, - - r_xprt->rx_stats.read_chunk_count, - r_xprt->rx_stats.write_chunk_count, - r_xprt->rx_stats.reply_chunk_count, - r_xprt->rx_stats.total_rdma_request, - r_xprt->rx_stats.total_rdma_reply, - r_xprt->rx_stats.pullup_copy_count, - r_xprt->rx_stats.fixup_copy_count, - r_xprt->rx_stats.hardway_register_count, - r_xprt->rx_stats.failed_marshal_count, - r_xprt->rx_stats.bad_reply_count); + seq_puts(seq, "\txprt:\trdma "); + seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ", + 0, /* need a local port? */ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count, + r_xprt->rx_stats.nomsg_call_count); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 891c4ede2c20..eb081ad05e33 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -52,6 +52,7 @@ #include <linux/prefetch.h> #include <linux/sunrpc/addr.h> #include <asm/bitops.h> +#include <linux/module.h> /* try_module_get()/module_put() */ #include "xprt_rdma.h" @@ -414,6 +415,14 @@ connected: return 0; } +static void rpcrdma_destroy_id(struct rdma_cm_id *id) +{ + if (id) { + module_put(id->device->owner); + rdma_destroy_id(id); + } +} + static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) @@ -440,6 +449,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + + /* FIXME: + * Until xprtrdma supports DEVICE_REMOVAL, the provider must + * be pinned while there are active NFS/RDMA mounts to prevent + * hangs and crashes at umount time. + */ + if (!ia->ri_async_rc && !try_module_get(id->device->owner)) { + dprintk("RPC: %s: Failed to get device module\n", + __func__); + ia->ri_async_rc = -ENODEV; + } rc = ia->ri_async_rc; if (rc) goto out; @@ -449,16 +469,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); - goto out; + goto put; } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) - goto out; + goto put; return id; - +put: + module_put(id->device->owner); out: rdma_destroy_id(id); return ERR_PTR(rc); @@ -493,9 +514,11 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc, mem_priv; struct rpcrdma_ia *ia = &xprt->rx_ia; struct ib_device_attr *devattr = &ia->ri_devattr; + int rc; + + ia->ri_dma_mr = NULL; ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { @@ -519,11 +542,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out3; } - if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { - ia->ri_have_dma_lkey = 1; - ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; - } - if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ if (((devattr->device_cap_flags & @@ -539,42 +557,19 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (!ia->ri_device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); - memreg = RPCRDMA_ALLPHYSICAL; + goto out3; } } - /* - * Optionally obtain an underlying physical identity mapping in - * order to do a memory window-based bind. This base registration - * is protected from remote access - that is enabled only by binding - * for the specific bytes targeted during each RPC operation, and - * revoked after the corresponding completion similar to a storage - * adapter. - */ switch (memreg) { case RPCRDMA_FRMR: ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: ia->ri_ops = &rpcrdma_physical_memreg_ops; - mem_priv = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ; - goto register_setup; + break; case RPCRDMA_MTHCAFMR: ia->ri_ops = &rpcrdma_fmr_memreg_ops; - if (ia->ri_have_dma_lkey) - break; - mem_priv = IB_ACCESS_LOCAL_WRITE; - register_setup: - ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); - if (IS_ERR(ia->ri_bind_mem)) { - printk(KERN_ALERT "%s: ib_get_dma_mr for " - "phys register failed with %lX\n", - __func__, PTR_ERR(ia->ri_bind_mem)); - rc = -ENOMEM; - goto out3; - } break; default: printk(KERN_ERR "RPC: Unsupported memory " @@ -592,7 +587,7 @@ out3: ib_dealloc_pd(ia->ri_pd); ia->ri_pd = NULL; out2: - rdma_destroy_id(ia->ri_id); + rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; out1: return rc; @@ -606,25 +601,17 @@ out1: void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - int rc; - dprintk("RPC: %s: entering\n", __func__); - if (ia->ri_bind_mem != NULL) { - rc = ib_dereg_mr(ia->ri_bind_mem); - dprintk("RPC: %s: ib_dereg_mr returned %i\n", - __func__, rc); - } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; } /* If the pd is still busy, xprtrdma missed freeing a resource */ if (ia->ri_pd && !IS_ERR(ia->ri_pd)) - WARN_ON(ib_dealloc_pd(ia->ri_pd)); + ib_dealloc_pd(ia->ri_pd); } /* @@ -639,6 +626,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct ib_cq_init_attr cq_attr = {}; int rc, err; + if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + dprintk("RPC: %s: insufficient sge's available\n", + __func__); + return -ENOMEM; + } + /* check provider's send/recv wr limits */ if (cdata->max_requests > devattr->max_qp_wr) cdata->max_requests = devattr->max_qp_wr; @@ -651,21 +644,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; ep->rep_attr.qp_type = IB_QPT_RC; ep->rep_attr.port_num = ~0; - if (cdata->padding) { - ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding, - GFP_KERNEL); - if (IS_ERR(ep->rep_padbuf)) - return PTR_ERR(ep->rep_padbuf); - } else - ep->rep_padbuf = NULL; - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, @@ -748,7 +733,8 @@ out2: dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); out1: - rpcrdma_free_regbuf(ia, ep->rep_padbuf); + if (ia->ri_dma_mr) + ib_dereg_mr(ia->ri_dma_mr); return rc; } @@ -775,8 +761,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ia->ri_id->qp = NULL; } - rpcrdma_free_regbuf(ia, ep->rep_padbuf); - rpcrdma_clean_cq(ep->rep_attr.recv_cq); rc = ib_destroy_cq(ep->rep_attr.recv_cq); if (rc) @@ -788,6 +772,12 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); + + if (ia->ri_dma_mr) { + rc = ib_dereg_mr(ia->ri_dma_mr); + dprintk("RPC: %s: ib_dereg_mr returned %i\n", + __func__, rc); + } } /* @@ -825,7 +815,7 @@ retry: if (ia->ri_device != id->device) { printk("RPC: %s: can't reconnect on " "different device!\n", __func__); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -834,7 +824,7 @@ retry: if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -845,7 +835,7 @@ retry: write_unlock(&ia->ri_qplock); rdma_destroy_qp(old); - rdma_destroy_id(old); + rpcrdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -1229,75 +1219,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) (unsigned long long)seg->mr_dma, seg->mr_dmalen); } -static int -rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, - struct ib_mr **mrp, struct ib_sge *iov) -{ - struct ib_phys_buf ipb; - struct ib_mr *mr; - int rc; - - /* - * All memory passed here was kmalloc'ed, therefore phys-contiguous. - */ - iov->addr = ib_dma_map_single(ia->ri_device, - va, len, DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_device, iov->addr)) - return -ENOMEM; - - iov->length = len; - - if (ia->ri_have_dma_lkey) { - *mrp = NULL; - iov->lkey = ia->ri_dma_lkey; - return 0; - } else if (ia->ri_bind_mem != NULL) { - *mrp = NULL; - iov->lkey = ia->ri_bind_mem->lkey; - return 0; - } - - ipb.addr = iov->addr; - ipb.size = iov->length; - mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, - IB_ACCESS_LOCAL_WRITE, &iov->addr); - - dprintk("RPC: %s: phys convert: 0x%llx " - "registered 0x%llx length %d\n", - __func__, (unsigned long long)ipb.addr, - (unsigned long long)iov->addr, len); - - if (IS_ERR(mr)) { - *mrp = NULL; - rc = PTR_ERR(mr); - dprintk("RPC: %s: failed with %i\n", __func__, rc); - } else { - *mrp = mr; - iov->lkey = mr->lkey; - rc = 0; - } - - return rc; -} - -static int -rpcrdma_deregister_internal(struct rpcrdma_ia *ia, - struct ib_mr *mr, struct ib_sge *iov) -{ - int rc; - - ib_dma_unmap_single(ia->ri_device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); - - if (NULL == mr) - return 0; - - rc = ib_dereg_mr(mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); - return rc; -} - /** * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia @@ -1317,26 +1238,29 @@ struct rpcrdma_regbuf * rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) { struct rpcrdma_regbuf *rb; - int rc; + struct ib_sge *iov; - rc = -ENOMEM; rb = kmalloc(sizeof(*rb) + size, flags); if (rb == NULL) goto out; - rb->rg_size = size; - rb->rg_owner = NULL; - rc = rpcrdma_register_internal(ia, rb->rg_base, size, - &rb->rg_mr, &rb->rg_iov); - if (rc) + iov = &rb->rg_iov; + iov->addr = ib_dma_map_single(ia->ri_device, + (void *)rb->rg_base, size, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_device, iov->addr)) goto out_free; + iov->length = size; + iov->lkey = ia->ri_pd->local_dma_lkey; + rb->rg_size = size; + rb->rg_owner = NULL; return rb; out_free: kfree(rb); out: - return ERR_PTR(rc); + return ERR_PTR(-ENOMEM); } /** @@ -1347,10 +1271,15 @@ out: void rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { - if (rb) { - rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov); - kfree(rb); - } + struct ib_sge *iov; + + if (!rb) + return; + + iov = &rb->rg_iov; + ib_dma_unmap_single(ia->ri_device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); + kfree(rb); } /* @@ -1363,9 +1292,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { + struct ib_device *device = ia->ri_device; struct ib_send_wr send_wr, *send_wr_fail; struct rpcrdma_rep *rep = req->rl_reply; - int rc; + struct ib_sge *iov = req->rl_send_iov; + int i, rc; if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); @@ -1376,22 +1307,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr.next = NULL; send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; - send_wr.sg_list = req->rl_send_iov; + send_wr.sg_list = iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[3].addr, - req->rl_send_iov[3].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[1].addr, - req->rl_send_iov[1].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[0].addr, - req->rl_send_iov[0].length, - DMA_TO_DEVICE); + + for (i = 0; i < send_wr.num_sge; i++) + ib_dma_sync_single_for_device(device, iov[i].addr, + iov[i].length, DMA_TO_DEVICE); + dprintk("RPC: %s: posting %d s/g entries\n", + __func__, send_wr.num_sge); if (DECR_CQCOUNT(ep) > 0) send_wr.send_flags = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f49dd8b38122..c09414e6f91b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,7 +51,6 @@ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ -#include <linux/sunrpc/svc.h> /* RPCSVC_MAXPAYLOAD */ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ @@ -65,9 +64,7 @@ struct rpcrdma_ia { struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct ib_mr *ri_bind_mem; - u32 ri_dma_lkey; - int ri_have_dma_lkey; + struct ib_mr *ri_dma_mr; struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; @@ -89,7 +86,6 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; - struct rpcrdma_regbuf *rep_padbuf; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; @@ -119,7 +115,6 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { size_t rg_size; struct rpcrdma_req *rg_owner; - struct ib_mr *rg_mr; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -165,8 +160,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * struct rpcrdma_buffer. N is the max number of outstanding requests. */ -/* temporary static scatter/gather max */ -#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ +#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ struct rpcrdma_buffer; @@ -258,16 +252,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; +#define RPCRDMA_MAX_IOVS (2) + struct rpcrdma_req { - unsigned int rl_niovs; /* 0, 2 or 4 */ - unsigned int rl_nchunks; /* non-zero if chunks */ - unsigned int rl_connect_cookie; /* retry detection */ - struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + unsigned int rl_niovs; + unsigned int rl_nchunks; + unsigned int rl_connect_cookie; + struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct ib_sge rl_send_iov[4]; /* for active requests */ - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; + struct rpcrdma_regbuf *rl_rdmabuf; + struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -342,6 +338,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; + unsigned long nomsg_call_count; }; /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0030376327b7..1a85e0ed0b48 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -777,7 +777,6 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt) xs_sock_reset_connection_flags(xprt); /* Mark transport as closed and wake up all pending tasks */ xprt_disconnect_done(xprt); - xprt_force_disconnect(xprt); } /** @@ -822,6 +821,8 @@ static void xs_reset_transport(struct sock_xprt *transport) if (atomic_read(&transport->xprt.swapper)) sk_clear_memalloc(sk); + kernel_sock_shutdown(sock, SHUT_RDWR); + write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; @@ -829,6 +830,7 @@ static void xs_reset_transport(struct sock_xprt *transport) sk->sk_user_data = NULL; xs_restore_old_callbacks(transport, sk); + xprt_clear_connected(xprt); write_unlock_bh(&sk->sk_callback_lock); xs_sock_reset_connection_flags(xprt); @@ -878,8 +880,11 @@ static void xs_xprt_free(struct rpc_xprt *xprt) */ static void xs_destroy(struct rpc_xprt *xprt) { + struct sock_xprt *transport = container_of(xprt, + struct sock_xprt, xprt); dprintk("RPC: xs_destroy xprt %p\n", xprt); + cancel_delayed_work_sync(&transport->connect_worker); xs_close(xprt); xs_xprt_free(xprt); module_put(THIS_MODULE); @@ -1432,6 +1437,7 @@ out: static void xs_tcp_state_change(struct sock *sk) { struct rpc_xprt *xprt; + struct sock_xprt *transport; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) @@ -1443,13 +1449,12 @@ static void xs_tcp_state_change(struct sock *sk) sock_flag(sk, SOCK_ZAPPED), sk->sk_shutdown); + transport = container_of(xprt, struct sock_xprt, xprt); trace_rpc_socket_state_change(xprt, sk->sk_socket); switch (sk->sk_state) { case TCP_ESTABLISHED: spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { - struct sock_xprt *transport = container_of(xprt, - struct sock_xprt, xprt); /* Reset TCP record info */ transport->tcp_offset = 0; @@ -1458,6 +1463,8 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; xprt->connect_cookie++; + clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); + xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, -EAGAIN); } @@ -1493,6 +1500,9 @@ static void xs_tcp_state_change(struct sock *sk) smp_mb__after_atomic(); break; case TCP_CLOSE: + if (test_and_clear_bit(XPRT_SOCK_CONNECTING, + &transport->sock_state)) + xprt_clear_connecting(xprt); xs_sock_mark_closed(xprt); } out: @@ -1866,7 +1876,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_data_ready = xs_local_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_clear_connected(xprt); @@ -2051,7 +2061,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_set_connected(xprt); @@ -2153,7 +2163,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); @@ -2176,6 +2186,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... */ xprt->stat.connect_count++; xprt->stat.connect_start = jiffies; + set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { case 0: @@ -2237,7 +2248,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EINPROGRESS: case -EALREADY: xprt_unlock_connect(xprt, transport); - xprt_clear_connecting(xprt); return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -2279,13 +2289,14 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); - /* Start by resetting any existing state */ - xs_reset_transport(transport); - - if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { + if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", xprt, xprt->reestablish_timeout / HZ); + + /* Start by resetting any existing state */ + xs_reset_transport(transport); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, xprt->reestablish_timeout); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 16c1c43980a1..7a9ab90363be 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1,6 +1,6 @@ /* * net/switchdev/switchdev.c - Switch device API - * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us> + * Copyright (c) 2014-2015 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -16,10 +16,83 @@ #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/if_bridge.h> +#include <linux/list.h> #include <net/ip_fib.h> #include <net/switchdev.h> /** + * switchdev_trans_item_enqueue - Enqueue data item to transaction queue + * + * @trans: transaction + * @data: pointer to data being queued + * @destructor: data destructor + * @tritem: transaction item being queued + * + * Enqeueue data item to transaction queue. tritem is typically placed in + * cointainter pointed at by data pointer. Destructor is called on + * transaction abort and after successful commit phase in case + * the caller did not dequeue the item before. + */ +void switchdev_trans_item_enqueue(struct switchdev_trans *trans, + void *data, void (*destructor)(void const *), + struct switchdev_trans_item *tritem) +{ + tritem->data = data; + tritem->destructor = destructor; + list_add_tail(&tritem->list, &trans->item_list); +} +EXPORT_SYMBOL_GPL(switchdev_trans_item_enqueue); + +static struct switchdev_trans_item * +__switchdev_trans_item_dequeue(struct switchdev_trans *trans) +{ + struct switchdev_trans_item *tritem; + + if (list_empty(&trans->item_list)) + return NULL; + tritem = list_first_entry(&trans->item_list, + struct switchdev_trans_item, list); + list_del(&tritem->list); + return tritem; +} + +/** + * switchdev_trans_item_dequeue - Dequeue data item from transaction queue + * + * @trans: transaction + */ +void *switchdev_trans_item_dequeue(struct switchdev_trans *trans) +{ + struct switchdev_trans_item *tritem; + + tritem = __switchdev_trans_item_dequeue(trans); + BUG_ON(!tritem); + return tritem->data; +} +EXPORT_SYMBOL_GPL(switchdev_trans_item_dequeue); + +static void switchdev_trans_init(struct switchdev_trans *trans) +{ + INIT_LIST_HEAD(&trans->item_list); +} + +static void switchdev_trans_items_destroy(struct switchdev_trans *trans) +{ + struct switchdev_trans_item *tritem; + + while ((tritem = __switchdev_trans_item_dequeue(trans))) + tritem->destructor(tritem->data); +} + +static void switchdev_trans_items_warn_destroy(struct net_device *dev, + struct switchdev_trans *trans) +{ + WARN(!list_empty(&trans->item_list), "%s: transaction item queue is not empty.\n", + dev->name); + switchdev_trans_items_destroy(trans); +} + +/** * switchdev_port_attr_get - Get port attribute * * @dev: port device @@ -31,7 +104,7 @@ int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) struct net_device *lower_dev; struct list_head *iter; struct switchdev_attr first = { - .id = SWITCHDEV_ATTR_UNDEFINED + .id = SWITCHDEV_ATTR_ID_UNDEFINED }; int err = -EOPNOTSUPP; @@ -51,7 +124,7 @@ int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) err = switchdev_port_attr_get(lower_dev, attr); if (err) break; - if (first.id == SWITCHDEV_ATTR_UNDEFINED) + if (first.id == SWITCHDEV_ATTR_ID_UNDEFINED) first = *attr; else if (memcmp(&first, attr, sizeof(*attr))) return -ENODATA; @@ -62,7 +135,8 @@ int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) EXPORT_SYMBOL_GPL(switchdev_port_attr_get); static int __switchdev_port_attr_set(struct net_device *dev, - struct switchdev_attr *attr) + struct switchdev_attr *attr, + struct switchdev_trans *trans) { const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; @@ -70,10 +144,10 @@ static int __switchdev_port_attr_set(struct net_device *dev, int err = -EOPNOTSUPP; if (ops && ops->switchdev_port_attr_set) - return ops->switchdev_port_attr_set(dev, attr); + return ops->switchdev_port_attr_set(dev, attr, trans); if (attr->flags & SWITCHDEV_F_NO_RECURSE) - return err; + goto done; /* Switch device port(s) may be stacked under * bond/team/vlan dev, so recurse down to set attr on @@ -81,11 +155,18 @@ static int __switchdev_port_attr_set(struct net_device *dev, */ netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = __switchdev_port_attr_set(lower_dev, attr); + err = __switchdev_port_attr_set(lower_dev, attr, trans); + if (err == -EOPNOTSUPP && + attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP) + continue; if (err) break; } +done: + if (err == -EOPNOTSUPP && attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP) + err = 0; + return err; } @@ -144,6 +225,7 @@ static int switchdev_port_attr_set_defer(struct net_device *dev, */ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) { + struct switchdev_trans trans; int err; if (!rtnl_is_locked()) { @@ -156,6 +238,8 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) return switchdev_port_attr_set_defer(dev, attr); } + switchdev_trans_init(&trans); + /* Phase I: prepare for attr set. Driver/device should fail * here if there are going to be issues in the commit phase, * such as lack of resources or support. The driver/device @@ -163,18 +247,16 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) * but should not commit the attr. */ - attr->trans = SWITCHDEV_TRANS_PREPARE; - err = __switchdev_port_attr_set(dev, attr); + trans.ph_prepare = true; + err = __switchdev_port_attr_set(dev, attr, &trans); if (err) { /* Prepare phase failed: abort the transaction. Any * resources reserved in the prepare phase are * released. */ - if (err != -EOPNOTSUPP) { - attr->trans = SWITCHDEV_TRANS_ABORT; - __switchdev_port_attr_set(dev, attr); - } + if (err != -EOPNOTSUPP) + switchdev_trans_items_destroy(&trans); return err; } @@ -184,17 +266,19 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) * because the driver said everythings was OK in phase I. */ - attr->trans = SWITCHDEV_TRANS_COMMIT; - err = __switchdev_port_attr_set(dev, attr); + trans.ph_prepare = false; + err = __switchdev_port_attr_set(dev, attr, &trans); WARN(err, "%s: Commit of attribute (id=%d) failed.\n", dev->name, attr->id); + switchdev_trans_items_warn_destroy(dev, &trans); return err; } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); static int __switchdev_port_obj_add(struct net_device *dev, - struct switchdev_obj *obj) + const struct switchdev_obj *obj, + struct switchdev_trans *trans) { const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; @@ -202,7 +286,7 @@ static int __switchdev_port_obj_add(struct net_device *dev, int err = -EOPNOTSUPP; if (ops && ops->switchdev_port_obj_add) - return ops->switchdev_port_obj_add(dev, obj); + return ops->switchdev_port_obj_add(dev, obj, trans); /* Switch device port(s) may be stacked under * bond/team/vlan dev, so recurse down to add object on @@ -210,7 +294,7 @@ static int __switchdev_port_obj_add(struct net_device *dev, */ netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = __switchdev_port_obj_add(lower_dev, obj); + err = __switchdev_port_obj_add(lower_dev, obj, trans); if (err) break; } @@ -222,6 +306,7 @@ static int __switchdev_port_obj_add(struct net_device *dev, * switchdev_port_obj_add - Add port object * * @dev: port device + * @id: object ID * @obj: object to add * * Use a 2-phase prepare-commit transaction model to ensure @@ -230,12 +315,16 @@ static int __switchdev_port_obj_add(struct net_device *dev, * * rtnl_lock must be held. */ -int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj) +int switchdev_port_obj_add(struct net_device *dev, + const struct switchdev_obj *obj) { + struct switchdev_trans trans; int err; ASSERT_RTNL(); + switchdev_trans_init(&trans); + /* Phase I: prepare for obj add. Driver/device should fail * here if there are going to be issues in the commit phase, * such as lack of resources or support. The driver/device @@ -243,18 +332,16 @@ int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj) * but should not commit the obj. */ - obj->trans = SWITCHDEV_TRANS_PREPARE; - err = __switchdev_port_obj_add(dev, obj); + trans.ph_prepare = true; + err = __switchdev_port_obj_add(dev, obj, &trans); if (err) { /* Prepare phase failed: abort the transaction. Any * resources reserved in the prepare phase are * released. */ - if (err != -EOPNOTSUPP) { - obj->trans = SWITCHDEV_TRANS_ABORT; - __switchdev_port_obj_add(dev, obj); - } + if (err != -EOPNOTSUPP) + switchdev_trans_items_destroy(&trans); return err; } @@ -264,9 +351,10 @@ int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj) * because the driver said everythings was OK in phase I. */ - obj->trans = SWITCHDEV_TRANS_COMMIT; - err = __switchdev_port_obj_add(dev, obj); + trans.ph_prepare = false; + err = __switchdev_port_obj_add(dev, obj, &trans); WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); + switchdev_trans_items_warn_destroy(dev, &trans); return err; } @@ -276,9 +364,11 @@ EXPORT_SYMBOL_GPL(switchdev_port_obj_add); * switchdev_port_obj_del - Delete port object * * @dev: port device + * @id: object ID * @obj: object to delete */ -int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj) +int switchdev_port_obj_del(struct net_device *dev, + const struct switchdev_obj *obj) { const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; @@ -307,9 +397,12 @@ EXPORT_SYMBOL_GPL(switchdev_port_obj_del); * switchdev_port_obj_dump - Dump port objects * * @dev: port device + * @id: object ID * @obj: object to dump + * @cb: function to call with a filled object */ -int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj) +int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, + switchdev_obj_dump_cb_t *cb) { const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; @@ -317,7 +410,7 @@ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj) int err = -EOPNOTSUPP; if (ops && ops->switchdev_port_obj_dump) - return ops->switchdev_port_obj_dump(dev, obj); + return ops->switchdev_port_obj_dump(dev, obj, cb); /* Switch device port(s) may be stacked under * bond/team/vlan dev, so recurse down to dump objects on @@ -325,7 +418,7 @@ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj) */ netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_obj_dump(lower_dev, obj); + err = switchdev_port_obj_dump(lower_dev, obj, cb); break; } @@ -397,7 +490,7 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, EXPORT_SYMBOL_GPL(call_switchdev_notifiers); struct switchdev_vlan_dump { - struct switchdev_obj obj; + struct switchdev_obj_port_vlan vlan; struct sk_buff *skb; u32 filter_mask; u16 flags; @@ -405,8 +498,7 @@ struct switchdev_vlan_dump { u16 end; }; -static int switchdev_port_vlan_dump_put(struct net_device *dev, - struct switchdev_vlan_dump *dump) +static int switchdev_port_vlan_dump_put(struct switchdev_vlan_dump *dump) { struct bridge_vlan_info vinfo; @@ -436,12 +528,11 @@ static int switchdev_port_vlan_dump_put(struct net_device *dev, return 0; } -static int switchdev_port_vlan_dump_cb(struct net_device *dev, - struct switchdev_obj *obj) +static int switchdev_port_vlan_dump_cb(struct switchdev_obj *obj) { + struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); struct switchdev_vlan_dump *dump = - container_of(obj, struct switchdev_vlan_dump, obj); - struct switchdev_obj_vlan *vlan = &dump->obj.u.vlan; + container_of(vlan, struct switchdev_vlan_dump, vlan); int err = 0; if (vlan->vid_begin > vlan->vid_end) @@ -452,7 +543,7 @@ static int switchdev_port_vlan_dump_cb(struct net_device *dev, for (dump->begin = dump->end = vlan->vid_begin; dump->begin <= vlan->vid_end; dump->begin++, dump->end++) { - err = switchdev_port_vlan_dump_put(dev, dump); + err = switchdev_port_vlan_dump_put(dump); if (err) return err; } @@ -464,7 +555,7 @@ static int switchdev_port_vlan_dump_cb(struct net_device *dev, /* prepend */ dump->begin = vlan->vid_begin; } else { - err = switchdev_port_vlan_dump_put(dev, dump); + err = switchdev_port_vlan_dump_put(dump); dump->flags = vlan->flags; dump->begin = vlan->vid_begin; dump->end = vlan->vid_end; @@ -476,7 +567,7 @@ static int switchdev_port_vlan_dump_cb(struct net_device *dev, /* append */ dump->end = vlan->vid_end; } else { - err = switchdev_port_vlan_dump_put(dev, dump); + err = switchdev_port_vlan_dump_put(dump); dump->flags = vlan->flags; dump->begin = vlan->vid_begin; dump->end = vlan->vid_end; @@ -493,10 +584,7 @@ static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, u32 filter_mask) { struct switchdev_vlan_dump dump = { - .obj = { - .id = SWITCHDEV_OBJ_PORT_VLAN, - .cb = switchdev_port_vlan_dump_cb, - }, + .vlan.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .skb = skb, .filter_mask = filter_mask, }; @@ -504,12 +592,13 @@ static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, if ((filter_mask & RTEXT_FILTER_BRVLAN) || (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { - err = switchdev_port_obj_dump(dev, &dump.obj); + err = switchdev_port_obj_dump(dev, &dump.vlan.obj, + switchdev_port_vlan_dump_cb); if (err) goto err_out; if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) /* last one */ - err = switchdev_port_vlan_dump_put(dev, &dump); + err = switchdev_port_vlan_dump_put(&dump); } err_out: @@ -529,7 +618,7 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, int nlflags) { struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS, + .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, }; u16 mode = BRIDGE_MODE_UNDEF; u32 mask = BR_LEARNING | BR_LEARNING_SYNC; @@ -550,7 +639,7 @@ static int switchdev_port_br_setflag(struct net_device *dev, unsigned long brport_flag) { struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS, + .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, }; u8 flag = nla_get_u8(nlattr); int err; @@ -617,14 +706,13 @@ static int switchdev_port_br_setlink_protinfo(struct net_device *dev, static int switchdev_port_br_afspec(struct net_device *dev, struct nlattr *afspec, int (*f)(struct net_device *dev, - struct switchdev_obj *obj)) + const struct switchdev_obj *obj)) { struct nlattr *attr; struct bridge_vlan_info *vinfo; - struct switchdev_obj obj = { - .id = SWITCHDEV_OBJ_PORT_VLAN, + struct switchdev_obj_port_vlan vlan = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, }; - struct switchdev_obj_vlan *vlan = &obj.u.vlan; int rem; int err; @@ -634,30 +722,30 @@ static int switchdev_port_br_afspec(struct net_device *dev, if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo = nla_data(attr); - vlan->flags = vinfo->flags; + vlan.flags = vinfo->flags; if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - if (vlan->vid_begin) + if (vlan.vid_begin) return -EINVAL; - vlan->vid_begin = vinfo->vid; + vlan.vid_begin = vinfo->vid; } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { - if (!vlan->vid_begin) + if (!vlan.vid_begin) return -EINVAL; - vlan->vid_end = vinfo->vid; - if (vlan->vid_end <= vlan->vid_begin) + vlan.vid_end = vinfo->vid; + if (vlan.vid_end <= vlan.vid_begin) return -EINVAL; - err = f(dev, &obj); + err = f(dev, &vlan.obj); if (err) return err; - memset(vlan, 0, sizeof(*vlan)); + memset(&vlan, 0, sizeof(vlan)); } else { - if (vlan->vid_begin) + if (vlan.vid_begin) return -EINVAL; - vlan->vid_begin = vinfo->vid; - vlan->vid_end = vinfo->vid; - err = f(dev, &obj); + vlan.vid_begin = vinfo->vid; + vlan.vid_end = vinfo->vid; + err = f(dev, &vlan.obj); if (err) return err; - memset(vlan, 0, sizeof(*vlan)); + memset(&vlan, 0, sizeof(vlan)); } } @@ -739,15 +827,13 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlm_flags) { - struct switchdev_obj obj = { - .id = SWITCHDEV_OBJ_PORT_FDB, - .u.fdb = { - .addr = addr, - .vid = vid, - }, + struct switchdev_obj_port_fdb fdb = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .addr = addr, + .vid = vid, }; - return switchdev_port_obj_add(dev, &obj); + return switchdev_port_obj_add(dev, &fdb.obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); @@ -766,30 +852,29 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { - struct switchdev_obj obj = { - .id = SWITCHDEV_OBJ_PORT_FDB, - .u.fdb = { - .addr = addr, - .vid = vid, - }, + struct switchdev_obj_port_fdb fdb = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .addr = addr, + .vid = vid, }; - return switchdev_port_obj_del(dev, &obj); + return switchdev_port_obj_del(dev, &fdb.obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); struct switchdev_fdb_dump { - struct switchdev_obj obj; + struct switchdev_obj_port_fdb fdb; + struct net_device *dev; struct sk_buff *skb; struct netlink_callback *cb; int idx; }; -static int switchdev_port_fdb_dump_cb(struct net_device *dev, - struct switchdev_obj *obj) +static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj) { + struct switchdev_obj_port_fdb *fdb = SWITCHDEV_OBJ_PORT_FDB(obj); struct switchdev_fdb_dump *dump = - container_of(obj, struct switchdev_fdb_dump, obj); + container_of(fdb, struct switchdev_fdb_dump, fdb); u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; @@ -809,13 +894,13 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev, ndm->ndm_pad2 = 0; ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; - ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = obj->u.fdb.ndm_state; + ndm->ndm_ifindex = dump->dev->ifindex; + ndm->ndm_state = fdb->ndm_state; - if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr)) + if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, fdb->addr)) goto nla_put_failure; - if (obj->u.fdb.vid && nla_put_u16(dump->skb, NDA_VLAN, obj->u.fdb.vid)) + if (fdb->vid && nla_put_u16(dump->skb, NDA_VLAN, fdb->vid)) goto nla_put_failure; nlmsg_end(dump->skb, nlh); @@ -845,20 +930,14 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *filter_dev, int idx) { struct switchdev_fdb_dump dump = { - .obj = { - .id = SWITCHDEV_OBJ_PORT_FDB, - .cb = switchdev_port_fdb_dump_cb, - }, + .fdb.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .dev = dev, .skb = skb, .cb = cb, .idx = idx, }; - int err; - - err = switchdev_port_obj_dump(dev, &dump.obj); - if (err) - return err; + switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); @@ -889,7 +968,7 @@ static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) { struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, }; struct switchdev_attr prev_attr; struct net_device *dev = NULL; @@ -936,17 +1015,15 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 nlflags, u32 tb_id) { - struct switchdev_obj fib_obj = { - .id = SWITCHDEV_OBJ_IPV4_FIB, - .u.ipv4_fib = { - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = nlflags, - .tb_id = tb_id, - }, + struct switchdev_obj_ipv4_fib ipv4_fib = { + .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .nlflags = nlflags, + .tb_id = tb_id, }; struct net_device *dev; int err = 0; @@ -967,7 +1044,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (!dev) return 0; - err = switchdev_port_obj_add(dev, &fib_obj); + err = switchdev_port_obj_add(dev, &ipv4_fib.obj); if (!err) fi->fib_flags |= RTNH_F_OFFLOAD; @@ -990,17 +1067,15 @@ EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 tb_id) { - struct switchdev_obj fib_obj = { - .id = SWITCHDEV_OBJ_IPV4_FIB, - .u.ipv4_fib = { - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = 0, - .tb_id = tb_id, - }, + struct switchdev_obj_ipv4_fib ipv4_fib = { + .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .nlflags = 0, + .tb_id = tb_id, }; struct net_device *dev; int err = 0; @@ -1012,7 +1087,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, if (!dev) return 0; - err = switchdev_port_obj_del(dev, &fib_obj); + err = switchdev_port_obj_del(dev, &ipv4_fib.obj); if (!err) fi->fib_flags &= ~RTNH_F_OFFLOAD; @@ -1044,11 +1119,11 @@ static bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct switchdev_attr a_attr = { - .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; struct switchdev_attr b_attr = { - .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 8b010c976b2f..41042de3ae9b 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -170,6 +170,30 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) } /** + * bclink_prepare_wakeup - prepare users for wakeup after congestion + * @bcl: broadcast link + * @resultq: queue for users which can be woken up + * Move a number of waiting users, as permitted by available space in + * the send queue, from link wait queue to specified queue for wakeup + */ +static void bclink_prepare_wakeup(struct tipc_link *bcl, struct sk_buff_head *resultq) +{ + int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; + int imp, lim; + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&bcl->wakeupq, skb, tmp) { + imp = TIPC_SKB_CB(skb)->chain_imp; + lim = bcl->window + bcl->backlog[imp].limit; + pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; + if ((pnd[imp] + bcl->backlog[imp].len) >= lim) + continue; + skb_unlink(skb, &bcl->wakeupq); + skb_queue_tail(resultq, skb); + } +} + +/** * tipc_bclink_wakeup_users - wake up pending users * * Called with no locks taken @@ -177,8 +201,12 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) void tipc_bclink_wakeup_users(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *bcl = tn->bcl; + struct sk_buff_head resultq; - tipc_sk_rcv(net, &tn->bclink->link.wakeupq); + skb_queue_head_init(&resultq); + bclink_prepare_wakeup(bcl, &resultq); + tipc_sk_rcv(net, &resultq); } /** diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 562c926a51cc..c5ac436235e0 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -539,6 +539,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) *err = -TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; + msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; dnode = addr_domain(net, msg_lookup_scope(msg)); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 03ee4d359f6a..ef31b40ad550 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2179,8 +2179,21 @@ unlock: if (UNIXCB(skb).fp) scm.fp = scm_fp_dup(UNIXCB(skb).fp); - sk_peek_offset_fwd(sk, chunk); + if (skip) { + sk_peek_offset_fwd(sk, chunk); + skip -= chunk; + } + if (UNIXCB(skb).fp) + break; + + last = skb; + last_len = skb->len; + unix_state_lock(sk); + skb = skb_peek_next(skb, &sk->sk_receive_queue); + if (skb) + goto again; + unix_state_unlock(sk); break; } } while (size); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 70aef72c4ca2..7258246b7458 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2625,7 +2625,7 @@ static void restore_regulatory_settings(bool reset_user) * settings, user regulatory settings takes precedence. */ if (is_an_alpha2(alpha2)) - regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER); + regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER); spin_lock(®_requests_lock); list_splice_tail_init(&tmp_reg_req_list, ®_requests_list); diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 42f7c76cf853..f07224d8b88f 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -31,7 +31,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { - .geniv = "seqniv", + .geniv = "seqiv", .icv_truncbits = 64, } }, @@ -50,7 +50,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { - .geniv = "seqniv", + .geniv = "seqiv", .icv_truncbits = 96, } }, @@ -69,7 +69,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { - .geniv = "seqniv", + .geniv = "seqiv", .icv_truncbits = 128, } }, @@ -88,7 +88,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { - .geniv = "seqniv", + .geniv = "seqiv", .icv_truncbits = 64, } }, @@ -107,7 +107,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { - .geniv = "seqniv", + .geniv = "seqiv", .icv_truncbits = 96, } }, @@ -126,7 +126,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { - .geniv = "seqniv", + .geniv = "seqiv", .icv_truncbits = 128, } }, @@ -164,7 +164,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { - .geniv = "seqniv", + .geniv = "seqiv", .icv_truncbits = 128, } }, diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 68ada2ca4b60..cc3676eb6239 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -19,7 +19,7 @@ #include <net/dst.h> #include <net/xfrm.h> -static int xfrm_output2(struct sock *sk, struct sk_buff *skb); +static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb); static int xfrm_skb_check_space(struct sk_buff *skb) { @@ -131,18 +131,20 @@ out: int xfrm_output_resume(struct sk_buff *skb, int err) { + struct net *net = xs_net(skb_dst(skb)->xfrm); + while (likely((err = xfrm_output_one(skb, err)) == 0)) { nf_reset(skb); - err = skb_dst(skb)->ops->local_out(skb); + err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) goto out; if (!skb_dst(skb)->xfrm) - return dst_output(skb); + return dst_output(net, skb->sk, skb); err = nf_hook(skb_dst(skb)->ops->family, - NF_INET_POST_ROUTING, skb->sk, skb, + NF_INET_POST_ROUTING, net, skb->sk, skb, NULL, skb_dst(skb)->dev, xfrm_output2); if (unlikely(err != 1)) goto out; @@ -156,12 +158,12 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output_resume); -static int xfrm_output2(struct sock *sk, struct sk_buff *skb) +static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { return xfrm_output_resume(skb, 1); } -static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb) +static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sk_buff *segs; @@ -177,7 +179,7 @@ static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb) int err; segs->next = NULL; - err = xfrm_output2(sk, segs); + err = xfrm_output2(net, sk, segs); if (unlikely(err)) { kfree_skb_list(nskb); @@ -196,7 +198,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) int err; if (skb_is_gso(skb)) - return xfrm_output_gso(sk, skb); + return xfrm_output_gso(net, sk, skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { err = skb_checksum_help(skb); @@ -207,7 +209,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) } } - return xfrm_output2(sk, skb); + return xfrm_output2(net, sk, skb); } EXPORT_SYMBOL_GPL(xfrm_output); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 94af3d065785..09bfcbac63bb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1208,7 +1208,7 @@ static inline int policy_to_flow_dir(int dir) } } -static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, +static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl) { struct xfrm_policy *pol; @@ -1583,8 +1583,6 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); xdst->flo.ops = &xfrm_bundle_fc_ops; - if (afinfo->init_dst) - afinfo->init_dst(net, xdst); } else xdst = ERR_PTR(-ENOBUFS); @@ -1889,6 +1887,7 @@ static void xfrm_policy_queue_process(unsigned long arg) struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = (struct xfrm_policy *)arg; + struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; @@ -1905,8 +1904,7 @@ static void xfrm_policy_queue_process(unsigned long arg) spin_unlock(&pq->hold_queue.lock); dst_hold(dst->path); - dst = xfrm_lookup(xp_net(pol), dst->path, &fl, - sk, 0); + dst = xfrm_lookup(net, dst->path, &fl, sk, 0); if (IS_ERR(dst)) goto purge_queue; @@ -1936,8 +1934,7 @@ static void xfrm_policy_queue_process(unsigned long arg) xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); dst_hold(skb_dst(skb)->path); - dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path, - &fl, skb->sk, 0); + dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; @@ -1947,7 +1944,7 @@ static void xfrm_policy_queue_process(unsigned long arg) skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(skb); + dst_output(net, skb->sk, skb); } out: @@ -1960,7 +1957,7 @@ purge_queue: xfrm_pol_put(pol); } -static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) +static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); @@ -2187,7 +2184,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, - struct sock *sk, int flags) + const struct sock *sk, int flags) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct flow_cache_object *flo; @@ -2335,7 +2332,7 @@ EXPORT_SYMBOL(xfrm_lookup); */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, - struct sock *sk, int flags) + const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | |