From 48bf6beddf455b0cb605915081f3428960a6224e Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Mon, 13 Apr 2015 17:26:28 -0400 Subject: mac80211: introduce plink lock for plink fields The mesh plink code uses sta->lock to serialize access to the plink state fields between the peer link state machine and the peer link timer. Some paths (e.g. those involving mps_qos_null_tx()) unfortunately hold this spinlock across frame tx, which is soon to be disallowed. Add a new spinlock just for plink access. Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/mesh_plink.c | 37 ++++++++++++++++++++----------------- net/mac80211/sta_info.c | 1 + net/mac80211/sta_info.h | 5 ++++- 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 60d737f144e3..ac843fc88745 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -72,10 +72,11 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, * * @sta: mesh peer link to restart * - * Locking: this function must be called holding sta->lock + * Locking: this function must be called holding sta->plink_lock */ static inline void mesh_plink_fsm_restart(struct sta_info *sta) { + lockdep_assert_held(&sta->plink_lock); sta->plink_state = NL80211_PLINK_LISTEN; sta->llid = sta->plid = sta->reason = 0; sta->plink_retries = 0; @@ -213,13 +214,15 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) * All mesh paths with this peer as next hop will be flushed * Returns beacon changed flag if the beacon content changed. * - * Locking: the caller must hold sta->lock + * Locking: the caller must hold sta->plink_lock */ static u32 __mesh_plink_deactivate(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed = 0; + lockdep_assert_held(&sta->plink_lock); + if (sta->plink_state == NL80211_PLINK_ESTAB) changed = mesh_plink_dec_estab_count(sdata); sta->plink_state = NL80211_PLINK_BLOCKED; @@ -244,13 +247,13 @@ u32 mesh_plink_deactivate(struct sta_info *sta) struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); changed = __mesh_plink_deactivate(sta); sta->reason = WLAN_REASON_MESH_PEER_CANCELED; mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, sta->sta.addr, sta->llid, sta->plid, sta->reason); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return changed; } @@ -387,7 +390,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[band]; rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates); - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); sta->last_rx = jiffies; /* rates and capabilities don't change during peering */ @@ -419,7 +422,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, else rate_control_rate_update(local, sband, sta, changed); out: - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); } static struct sta_info * @@ -552,7 +555,7 @@ static void mesh_plink_timer(unsigned long data) if (sta->sdata->local->quiescing) return; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); /* If a timer fires just before a state transition on another CPU, * we may have already extended the timeout and changed state by the @@ -563,7 +566,7 @@ static void mesh_plink_timer(unsigned long data) mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer adjusted)", sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return; } @@ -573,7 +576,7 @@ static void mesh_plink_timer(unsigned long data) mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer deleted)", sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return; } @@ -619,7 +622,7 @@ static void mesh_plink_timer(unsigned long data) default: break; } - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); if (action) mesh_plink_frame_tx(sdata, action, sta->sta.addr, sta->llid, sta->plid, reason); @@ -674,16 +677,16 @@ u32 mesh_plink_open(struct sta_info *sta) if (!test_sta_flag(sta, WLAN_STA_AUTH)) return 0; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); sta->llid = mesh_get_new_llid(sdata); if (sta->plink_state != NL80211_PLINK_LISTEN && sta->plink_state != NL80211_PLINK_BLOCKED) { - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return 0; } sta->plink_state = NL80211_PLINK_OPN_SNT; mesh_plink_timer_set(sta, sdata->u.mesh.mshcfg.dot11MeshRetryTimeout); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); mpl_dbg(sdata, "Mesh plink: starting establishment with %pM\n", sta->sta.addr); @@ -700,10 +703,10 @@ u32 mesh_plink_block(struct sta_info *sta) { u32 changed; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); changed = __mesh_plink_deactivate(sta); sta->plink_state = NL80211_PLINK_BLOCKED; - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return changed; } @@ -758,7 +761,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr, mplstates[sta->plink_state], mplevents[event]); - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); switch (sta->plink_state) { case NL80211_PLINK_LISTEN: switch (event) { @@ -872,7 +875,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, */ break; } - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); if (action) { mesh_plink_frame_tx(sdata, action, sta->sta.addr, sta->llid, sta->plid, sta->reason); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 12971b71d0fa..53ab4bd1a44c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -295,6 +295,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); #ifdef CONFIG_MAC80211_MESH + spin_lock_init(&sta->plink_lock); if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.user_mpm) init_timer(&sta->plink_timer); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 5c164fb3f6c5..40ad52e447ec 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -299,6 +299,7 @@ struct sta_ampdu_mlme { * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: identity mapping to ID timers + * @plink_lock: serialize access to plink fields * @llid: Local link ID * @plid: Peer link ID * @reason: Cancel reason on PLINK_HOLDING state @@ -422,9 +423,10 @@ struct sta_info { #ifdef CONFIG_MAC80211_MESH /* - * Mesh peer link attributes + * Mesh peer link attributes, protected by plink_lock. * TODO: move to a sub-structure that is referenced with pointer? */ + spinlock_t plink_lock; u16 llid; u16 plid; u16 reason; @@ -432,6 +434,7 @@ struct sta_info { enum nl80211_plink_state plink_state; u32 plink_timeout; struct timer_list plink_timer; + s64 t_offset; s64 t_offset_setpoint; /* mesh power save */ -- cgit From 35c347ac53040daba955fa06fcd5f909bee85017 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Mar 2015 16:10:08 +0100 Subject: mac80211: lock rate control Both minstrel (reported by Sven Eckelmann) and the iwlwifi rate control aren't properly taking concurrency into account. It's likely that the same is true for other rate control algorithms. In the case of minstrel this manifests itself in crashes when an update and other data access are run concurrently, for example when the stations change bandwidth or similar. In iwlwifi, this can cause firmware crashes. Since fixing all rate control algorithms will be very difficult, just provide locking for invocations. This protects the internal data structures the algorithms maintain. I've manipulated hostapd to test this, by having it change its advertised bandwidth roughly ever 150ms. At the same time, I'm running a flood ping between the client and the AP, which causes this race of update vs. get_rate/status to easily happen on the client. With this change, the system survives this test. Reported-by: Sven Eckelmann Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 8 +++++++- net/mac80211/rate.h | 14 +++++++++++--- net/mac80211/sta_info.c | 2 +- net/mac80211/sta_info.h | 1 + 4 files changed, 20 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index d53355b011f5..de69adf24f53 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -683,7 +683,13 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) return; - ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); + if (ista) { + spin_lock_bh(&sta->rate_ctrl_lock); + ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); + spin_unlock_bh(&sta->rate_ctrl_lock); + } else { + ref->ops->get_rate(ref->priv, NULL, NULL, txrc); + } if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_RC_TABLE) return; diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 38652f09feaf..25c9be5dd7fd 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -42,10 +42,12 @@ static inline void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + spin_lock_bh(&sta->rate_ctrl_lock); if (ref->ops->tx_status) ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb); else ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); + spin_unlock_bh(&sta->rate_ctrl_lock); } static inline void @@ -64,7 +66,9 @@ rate_control_tx_status_noskb(struct ieee80211_local *local, if (WARN_ON_ONCE(!ref->ops->tx_status_noskb)) return; + spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); + spin_unlock_bh(&sta->rate_ctrl_lock); } static inline void rate_control_rate_init(struct sta_info *sta) @@ -91,8 +95,10 @@ static inline void rate_control_rate_init(struct sta_info *sta) sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; + spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, priv_sta); + spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } @@ -115,18 +121,20 @@ static inline void rate_control_rate_update(struct ieee80211_local *local, return; } + spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, ista, priv_sta, changed); + spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); } drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); } static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, - struct ieee80211_sta *sta, - gfp_t gfp) + struct sta_info *sta, gfp_t gfp) { - return ref->ops->alloc_sta(ref->priv, sta, gfp); + spin_lock_init(&sta->rate_ctrl_lock); + return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp); } static inline void rate_control_free_sta(struct sta_info *sta) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 53ab4bd1a44c..0800e02cce05 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -269,7 +269,7 @@ static int sta_prepare_rate_control(struct ieee80211_local *local, sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, - &sta->sta, gfp); + sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 40ad52e447ec..e4c4f871ffee 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -353,6 +353,7 @@ struct sta_info { u8 ptk_idx; struct rate_control_ref *rate_ctrl; void *rate_ctrl_priv; + spinlock_t rate_ctrl_lock; spinlock_t lock; struct work_struct drv_deliver_wk; -- cgit From 17c18bf880b2464851e5a2bca86521affc46c97e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 21 Mar 2015 15:25:43 +0100 Subject: mac80211: add TX fastpath In order to speed up mac80211's TX path, add the "fast-xmit" cache that will cache the data frame 802.11 header and other data to be able to build the frame more quickly. This cache is rebuilt when external triggers imply changes, but a lot of the checks done per packet today are simplified away to the check for the cache. There's also a more detailed description in the code. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 9 +- net/mac80211/chan.c | 6 + net/mac80211/ieee80211_i.h | 5 + net/mac80211/key.c | 2 + net/mac80211/rx.c | 2 + net/mac80211/sta_info.c | 6 + net/mac80211/sta_info.h | 27 +++ net/mac80211/tx.c | 420 ++++++++++++++++++++++++++++++++++++++++++++- 8 files changed, 475 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 265e42721a66..4aa5e893cbaa 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -137,6 +137,9 @@ static int ieee80211_set_noack_map(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->noack_map = noack_map; + + ieee80211_check_fast_xmit_iface(sdata); + return 0; } @@ -2099,10 +2102,14 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) int err; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { + ieee80211_check_fast_xmit_all(local); + err = drv_set_frag_threshold(local, wiphy->frag_threshold); - if (err) + if (err) { + ieee80211_check_fast_xmit_all(local); return err; + } } if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 5bcd4e5589d3..7e9b62475400 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -664,6 +664,8 @@ out: ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + ieee80211_check_fast_xmit_iface(sdata); + return ret; } @@ -1030,6 +1032,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (sdata->vif.type == NL80211_IFTYPE_AP) __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + ieee80211_check_fast_xmit_iface(sdata); + if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); @@ -1376,6 +1380,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + ieee80211_check_fast_xmit_iface(sdata); + sdata->radar_required = sdata->reserved_radar_required; if (sdata->vif.bss_conf.chandef.width != diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ab46ab4a7249..556051f68ad7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1651,6 +1651,11 @@ struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); +void ieee80211_check_fast_xmit(struct sta_info *sta); +void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); +void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); +void ieee80211_clear_fast_xmit(struct sta_info *sta); + /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 2291cd730091..3e0b814f4db3 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -229,6 +229,7 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, if (uni) { rcu_assign_pointer(sdata->default_unicast_key, key); + ieee80211_check_fast_xmit_iface(sdata); drv_set_default_unicast_key(sdata->local, sdata, idx); } @@ -298,6 +299,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); sta->ptk_idx = idx; + ieee80211_check_fast_xmit(sta); } else { rcu_assign_pointer(sta->gtk[idx], new); sta->gtk_idx = idx; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 260eed45b6d2..6e3b564b6dea 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1200,6 +1200,8 @@ static void sta_ps_start(struct sta_info *sta) ps_dbg(sdata, "STA %pM aid %d enters power save mode\n", sta->sta.addr, sta->sta.aid); + ieee80211_clear_fast_xmit(sta); + if (!sta->sta.txq[0]) return; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 0800e02cce05..737730abba6d 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1201,6 +1201,8 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", sta->sta.addr, sta->sta.aid, filtered, buffered); + + ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, @@ -1599,6 +1601,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_clear_fast_xmit(sta); return; } @@ -1616,6 +1619,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); @@ -1720,6 +1724,7 @@ int sta_info_move_state(struct sta_info *sta, !sta->sdata->u.vlan.sta)) atomic_dec(&sta->sdata->bss->num_mcast_sta); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); + ieee80211_clear_fast_xmit(sta); } break; case IEEE80211_STA_AUTHORIZED: @@ -1729,6 +1734,7 @@ int sta_info_move_state(struct sta_info *sta, !sta->sdata->u.vlan.sta)) atomic_inc(&sta->sdata->bss->num_mcast_sta); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); + ieee80211_check_fast_xmit(sta); } break; default: diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index e4c4f871ffee..0602363ff63b 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -241,6 +241,30 @@ struct sta_ampdu_mlme { /* Value to indicate no TID reservation */ #define IEEE80211_TID_UNRESERVED 0xff +/** + * struct ieee80211_fast_tx - TX fastpath information + * @key: key to use for hw crypto + * @hdr: the 802.11 header to put with the frame + * @hdr_len: actual 802.11 header length + * @sa_offs: offset of the SA + * @da_offs: offset of the DA + * @pn_offs: offset where to put PN for crypto (or 0 if not needed) + * @band: band this will be transmitted on, for tx_info + * @rcu_head: RCU head to free this struct + * + * Try to keep this struct small so it fits into a single cacheline. + */ +struct ieee80211_fast_tx { + struct ieee80211_key *key; + u8 hdr[30 + 2 + IEEE80211_CCMP_HDR_LEN + + sizeof(rfc1042_header)]; + u8 hdr_len; + u8 sa_offs, da_offs, pn_offs; + u8 band; + + struct rcu_head rcu_head; +}; + /** * struct sta_info - STA information * @@ -339,6 +363,7 @@ struct sta_ampdu_mlme { * using IEEE80211_NUM_TID entry for non-QoS frames * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID * entry for non-QoS frames + * @fast_tx: TX fastpath information */ struct sta_info { /* General information, mostly static */ @@ -356,6 +381,8 @@ struct sta_info { spinlock_t rate_ctrl_lock; spinlock_t lock; + struct ieee80211_fast_tx __rcu *fast_tx; + struct work_struct drv_deliver_wk; u16 listen_interval; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 667111ee6a20..160e1927323d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1600,7 +1600,7 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, if (skb_cloned(skb) && (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) || !skb_clone_writable(skb, ETH_HLEN) || - sdata->crypto_tx_tailroom_needed_cnt)) + (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt))) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); @@ -2387,6 +2387,414 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, return ERR_PTR(ret); } +/* + * fast-xmit overview + * + * The core idea of this fast-xmit is to remove per-packet checks by checking + * them out of band. ieee80211_check_fast_xmit() implements the out-of-band + * checks that are needed to get the sta->fast_tx pointer assigned, after which + * much less work can be done per packet. For example, fragmentation must be + * disabled or the fast_tx pointer will not be set. All the conditions are seen + * in the code here. + * + * Once assigned, the fast_tx data structure also caches the per-packet 802.11 + * header and other data to aid packet processing in ieee80211_xmit_fast(). + * + * The most difficult part of this is that when any of these assumptions + * change, an external trigger (i.e. a call to ieee80211_clear_fast_xmit(), + * ieee80211_check_fast_xmit() or friends) is required to reset the data, + * since the per-packet code no longer checks the conditions. This is reflected + * by the calls to these functions throughout the rest of the code, and must be + * maintained if any of the TX path checks change. + */ + +void ieee80211_check_fast_xmit(struct sta_info *sta) +{ + struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old; + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_hdr *hdr = (void *)build.hdr; + struct ieee80211_chanctx_conf *chanctx_conf; + __le16 fc; + + if (!(local->hw.flags & IEEE80211_HW_SUPPORT_FAST_XMIT)) + return; + + /* Locking here protects both the pointer itself, and against concurrent + * invocations winning data access races to, e.g., the key pointer that + * is used. + * Without it, the invocation of this function right after the key + * pointer changes wouldn't be sufficient, as another CPU could access + * the pointer, then stall, and then do the cache update after the CPU + * that invalidated the key. + * With the locking, such scenarios cannot happen as the check for the + * key and the fast-tx assignment are done atomically, so the CPU that + * modifies the key will either wait or other one will see the key + * cleared/changed already. + */ + spin_lock_bh(&sta->lock); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS && + !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) && + sdata->vif.type == NL80211_IFTYPE_STATION) + goto out; + + if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + goto out; + + if (test_sta_flag(sta, WLAN_STA_PS_STA) || + test_sta_flag(sta, WLAN_STA_PS_DRIVER) || + test_sta_flag(sta, WLAN_STA_PS_DELIVER)) + goto out; + + if (sdata->noack_map) + goto out; + + /* fast-xmit doesn't handle fragmentation at all */ + if (local->hw.wiphy->frag_threshold != (u32)-1) + goto out; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (!chanctx_conf) { + rcu_read_unlock(); + goto out; + } + build.band = chanctx_conf->def.chan->band; + rcu_read_unlock(); + + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + /* DA SA BSSID */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + memcpy(hdr->addr3, sdata->u.mgd.bssid, ETH_ALEN); + build.hdr_len = 24; + break; + } + + if (sdata->u.mgd.use_4addr) { + /* non-regular ethertype cannot use the fastpath */ + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr4); + build.hdr_len = 30; + break; + } + fc |= cpu_to_le16(IEEE80211_FCTL_TODS); + /* BSSID SA DA */ + memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + build.hdr_len = 24; + break; + case NL80211_IFTYPE_AP_VLAN: + if (sdata->wdev.use_4addr) { + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr4); + build.hdr_len = 30; + break; + } + /* fall through */ + case NL80211_IFTYPE_AP: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA BSSID SA */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.sa_offs = offsetof(struct ieee80211_hdr, addr3); + build.hdr_len = 24; + break; + default: + /* not handled on fast-xmit */ + goto out; + } + + if (sta->sta.wme) { + build.hdr_len += 2; + fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); + } + + /* We store the key here so there's no point in using rcu_dereference() + * but that's fine because the code that changes the pointers will call + * this function after doing so. For a single CPU that would be enough, + * for multiple see the comment above. + */ + build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]); + if (!build.key) + build.key = rcu_access_pointer(sdata->default_unicast_key); + if (build.key) { + bool gen_iv, iv_spc; + + gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; + iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; + + /* don't handle software crypto */ + if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + goto out; + + switch (build.key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + /* add fixed key ID */ + if (gen_iv) { + (build.hdr + build.hdr_len)[3] = + 0x20 | (build.key->conf.keyidx << 6); + build.pn_offs = build.hdr_len; + } + if (gen_iv || iv_spc) + build.hdr_len += IEEE80211_CCMP_HDR_LEN; + break; + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + /* add fixed key ID */ + if (gen_iv) { + (build.hdr + build.hdr_len)[3] = + 0x20 | (build.key->conf.keyidx << 6); + build.pn_offs = build.hdr_len; + } + if (gen_iv || iv_spc) + build.hdr_len += IEEE80211_GCMP_HDR_LEN; + break; + default: + /* don't do fast-xmit for these ciphers (yet) */ + goto out; + } + + fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + } + + hdr->frame_control = fc; + + memcpy(build.hdr + build.hdr_len, + rfc1042_header, sizeof(rfc1042_header)); + build.hdr_len += sizeof(rfc1042_header); + + fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC); + /* if the kmemdup fails, continue w/o fast_tx */ + if (!fast_tx) + goto out; + + out: + /* we might have raced against another call to this function */ + old = rcu_dereference_protected(sta->fast_tx, + lockdep_is_held(&sta->lock)); + rcu_assign_pointer(sta->fast_tx, fast_tx); + if (old) + kfree_rcu(old, rcu_head); + spin_unlock_bh(&sta->lock); +} + +void ieee80211_check_fast_xmit_all(struct ieee80211_local *local) +{ + struct sta_info *sta; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) + ieee80211_check_fast_xmit(sta); + rcu_read_unlock(); +} + +void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata && + (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) + continue; + ieee80211_check_fast_xmit(sta); + } + + rcu_read_unlock(); +} + +void ieee80211_clear_fast_xmit(struct sta_info *sta) +{ + struct ieee80211_fast_tx *fast_tx; + + spin_lock_bh(&sta->lock); + fast_tx = rcu_dereference_protected(sta->fast_tx, + lockdep_is_held(&sta->lock)); + RCU_INIT_POINTER(sta->fast_tx, NULL); + spin_unlock_bh(&sta->lock); + + if (fast_tx) + kfree_rcu(fast_tx, rcu_head); +} + +static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, + struct net_device *dev, struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + u16 ethertype = (skb->data[12] << 8) | skb->data[13]; + int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); + int hw_headroom = sdata->local->hw.extra_tx_headroom; + struct ethhdr eth; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; + struct ieee80211_tx_data tx; + ieee80211_tx_result r; + struct tid_ampdu_tx *tid_tx = NULL; + u8 tid = IEEE80211_NUM_TIDS; + + /* control port protocol needs a lot of special handling */ + if (cpu_to_be16(ethertype) == sdata->control_port_protocol) + return false; + + /* only RFC 1042 SNAP */ + if (ethertype < ETH_P_802_3_MIN) + return false; + + /* don't handle TX status request here either */ + if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + return false; + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); + if (tid_tx && + !test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) + return false; + } + + /* after this point (skb is modified) we cannot return false */ + + if (skb_shared(skb)) { + struct sk_buff *tmp_skb = skb; + + skb = skb_clone(skb, GFP_ATOMIC); + kfree_skb(tmp_skb); + + if (!skb) + return true; + } + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len + extra_head; + dev->trans_start = jiffies; + + /* will not be crypto-handled beyond what we do here, so use false + * as the may-encrypt argument for the resize to not account for + * more room than we already have in 'extra_head' + */ + if (unlikely(ieee80211_skb_resize(sdata, skb, + max_t(int, extra_head + hw_headroom - + skb_headroom(skb), 0), + false))) { + kfree_skb(skb); + return true; + } + + memcpy(ð, skb->data, ETH_HLEN - 2); + hdr = (void *)skb_push(skb, extra_head); + memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len); + memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN); + memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN); + + memset(info, 0, sizeof(*info)); + info->band = fast_tx->band; + info->control.vif = &sdata->vif; + info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | + IEEE80211_TX_CTL_DONTFRAG | + (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + *ieee80211_get_qos_ctl(hdr) = tid; + hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); + } else { + info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; + hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); + sdata->sequence_number += 0x10; + } + + sta->tx_msdu[tid]++; + + info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + + __skb_queue_head_init(&tx.skbs); + + tx.flags = IEEE80211_TX_UNICAST; + tx.local = local; + tx.sdata = sdata; + tx.sta = sta; + tx.key = fast_tx->key; + + if (fast_tx->key) + info->control.hw_key = &fast_tx->key->conf; + + if (!(local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) { + tx.skb = skb; + r = ieee80211_tx_h_rate_ctrl(&tx); + skb = tx.skb; + tx.skb = NULL; + + if (r != TX_CONTINUE) { + if (r != TX_QUEUED) + kfree_skb(skb); + return true; + } + } + + /* statistics normally done by ieee80211_tx_h_stats (but that + * has to consider fragmentation, so is more complex) + */ + sta->tx_fragments++; + sta->tx_bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->tx_packets[skb_get_queue_mapping(skb)]++; + + if (fast_tx->pn_offs) { + u64 pn; + u8 *crypto_hdr = skb->data + fast_tx->pn_offs; + + switch (fast_tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + pn = atomic64_inc_return(&fast_tx->key->u.ccmp.tx_pn); + crypto_hdr[0] = pn; + crypto_hdr[1] = pn >> 8; + crypto_hdr[4] = pn >> 16; + crypto_hdr[5] = pn >> 24; + crypto_hdr[6] = pn >> 32; + crypto_hdr[7] = pn >> 40; + break; + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + pn = atomic64_inc_return(&fast_tx->key->u.gcmp.tx_pn); + crypto_hdr[0] = pn; + crypto_hdr[1] = pn >> 8; + crypto_hdr[4] = pn >> 16; + crypto_hdr[5] = pn >> 24; + crypto_hdr[6] = pn >> 32; + crypto_hdr[7] = pn >> 40; + break; + } + } + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + + __skb_queue_tail(&tx.skbs, skb); + ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false); + return true; +} + void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags) @@ -2406,6 +2814,16 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; } + if (!IS_ERR_OR_NULL(sta)) { + struct ieee80211_fast_tx *fast_tx; + + fast_tx = rcu_dereference(sta->fast_tx); + + if (fast_tx && + ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb)) + goto out; + } + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); if (IS_ERR(skb)) goto out; -- cgit From 725b812c839643639d6135b1f7fb41e48de465a4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Apr 2015 14:02:08 +0200 Subject: mac80211: extend fast-xmit to driver fragmentation If the driver handles fragmentation then it wouldn't be done in software so we can still use the fast-xmit path in that case. Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 160e1927323d..d983683a8a13 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2450,7 +2450,8 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) goto out; /* fast-xmit doesn't handle fragmentation at all */ - if (local->hw.wiphy->frag_threshold != (u32)-1) + if (local->hw.wiphy->frag_threshold != (u32)-1 && + !local->ops->set_frag_threshold) goto out; rcu_read_lock(); -- cgit From e495c24731a2651fc2c1c7feedc8ba3b31f6b5d4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Apr 2015 14:03:17 +0200 Subject: mac80211: extend fast-xmit for more ciphers When crypto is offloaded then in some cases it's all handled by the device, and in others only some space for the IV must be reserved in the frame. Handle both of these cases in the fast-xmit path, up to a limit of 18 bytes of space for IVs. Signed-off-by: Johannes Berg --- net/mac80211/sta_info.h | 10 +++++++--- net/mac80211/tx.c | 40 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 0602363ff63b..c821fe47c2d1 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -241,6 +241,8 @@ struct sta_ampdu_mlme { /* Value to indicate no TID reservation */ #define IEEE80211_TID_UNRESERVED 0xff +#define IEEE80211_FAST_XMIT_MAX_IV 18 + /** * struct ieee80211_fast_tx - TX fastpath information * @key: key to use for hw crypto @@ -252,15 +254,17 @@ struct sta_ampdu_mlme { * @band: band this will be transmitted on, for tx_info * @rcu_head: RCU head to free this struct * - * Try to keep this struct small so it fits into a single cacheline. + * This struct is small enough so that the common case (maximum crypto + * header length of 8 like for CCMP/GCMP) fits into a single 64-byte + * cache line. */ struct ieee80211_fast_tx { struct ieee80211_key *key; - u8 hdr[30 + 2 + IEEE80211_CCMP_HDR_LEN + - sizeof(rfc1042_header)]; u8 hdr_len; u8 sa_offs, da_offs, pn_offs; u8 band; + u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV + + sizeof(rfc1042_header)]; struct rcu_head rcu_head; }; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d983683a8a13..39e80c3f4726 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2535,10 +2535,11 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) if (!build.key) build.key = rcu_access_pointer(sdata->default_unicast_key); if (build.key) { - bool gen_iv, iv_spc; + bool gen_iv, iv_spc, mmic; gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; + mmic = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC; /* don't handle software crypto */ if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) @@ -2567,9 +2568,42 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) if (gen_iv || iv_spc) build.hdr_len += IEEE80211_GCMP_HDR_LEN; break; - default: - /* don't do fast-xmit for these ciphers (yet) */ + case WLAN_CIPHER_SUITE_TKIP: + /* cannot handle MMIC or IV generation in xmit-fast */ + if (mmic || gen_iv) + goto out; + if (iv_spc) + build.hdr_len += IEEE80211_TKIP_IV_LEN; + break; + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + /* cannot handle IV generation in fast-xmit */ + if (gen_iv) + goto out; + if (iv_spc) + build.hdr_len += IEEE80211_WEP_IV_LEN; + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + WARN(1, + "management cipher suite 0x%x enabled for data\n", + build.key->conf.cipher); goto out; + default: + /* we don't know how to generate IVs for this at all */ + if (WARN_ON(gen_iv)) + goto out; + /* pure hardware keys are OK, of course */ + if (!(build.key->flags & KEY_FLAG_CIPHER_SCHEME)) + break; + /* cipher scheme might require space allocation */ + if (iv_spc && + build.key->conf.iv_len > IEEE80211_FAST_XMIT_MAX_IV) + goto out; + if (iv_spc) + build.hdr_len += build.key->conf.iv_len; } fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); -- cgit From 3ffd884012bb12d3ed64c9fd9a142a62fdcfaf11 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 14 Apr 2015 10:28:37 +0200 Subject: mac80211: extend fast-xmit to cover IBSS IBSS can be supported very easily since it uses the standard station authorization state etc. so it just needs to be covered by the header building switch statement. Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 39e80c3f4726..49ba43eff6c4 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2466,6 +2466,13 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); switch (sdata->vif.type) { + case NL80211_IFTYPE_ADHOC: + /* DA SA BSSID */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + memcpy(hdr->addr3, sdata->u.ibss.bssid, ETH_ALEN); + build.hdr_len = 24; + break; case NL80211_IFTYPE_STATION: if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* DA SA BSSID */ -- cgit From 2d981fddb0e4a5a4097d926f3b9fd3eaf25867a8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Apr 2015 14:10:10 +0200 Subject: mac80211: allow checksum offload only in fast-xmit When we go through the complete TX processing, there are a number of things like fragmentation and software crypto that require the checksum to be calculated already. In favour of maintainability, instead of adding the necessary call to skb_checksum_help() in all the places that need it, just do it once before the regular TX processing. Right now this only affects the TI wlcore and QCA ath10k drivers since they're the only ones using checksum offload. The previous commits enabled fast-xmit for them in almost all cases. For wlcore this even fixes a corner case: when a key fails to be programmed to hardware software encryption gets used, encrypting frames with a bad checksum. Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 49ba43eff6c4..20a90b16eb4d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2851,10 +2851,8 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, rcu_read_lock(); - if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { - kfree_skb(skb); - goto out; - } + if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) + goto out_free; if (!IS_ERR_OR_NULL(sta)) { struct ieee80211_fast_tx *fast_tx; @@ -2866,6 +2864,21 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; } + /* the frame could be fragmented, software-encrypted, and other things + * so we cannot really handle checksum offload with it - fix it up in + * software before we handle anything else. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (skb_checksum_help(skb)) + goto out_free; + } + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); if (IS_ERR(skb)) goto out; @@ -2875,6 +2888,9 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, dev->trans_start = jiffies; ieee80211_xmit(sdata, sta, skb); + goto out; + out_free: + kfree_skb(skb); out: rcu_read_unlock(); } -- cgit From 680a0daba74fed0bf30530c9b3e7e706cf29855f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Apr 2015 16:58:25 +0200 Subject: mac80211: allow drivers to support S/G If drivers want to support S/G (really just gather DMA on TX) then we can now easily support this on the fast-xmit path since it just needs to write to the ethernet header (and already has a check for that being possible.) However, disallow this on the regular TX path (which has to handle fragmentation, software crypto, etc.) by calling skb_linearize(). Also allow the related HIGHDMA since that's not interesting to the code in mac80211 at all anyway. Signed-off-by: Johannes Berg --- net/mac80211/main.c | 2 +- net/mac80211/tx.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index df3051d96aff..6dd6dd47a9df 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -840,7 +840,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* Only HW csum features are currently compatible with mac80211 */ feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | - NETIF_F_HW_CSUM; + NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA; if (WARN_ON(hw->netdev_features & ~feature_whitelist)) return -EINVAL; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 20a90b16eb4d..3a421a04cbec 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2864,6 +2864,12 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; } + /* we cannot process non-linear frames on this path */ + if (skb_linearize(skb)) { + kfree_skb(skb); + goto out; + } + /* the frame could be fragmented, software-encrypted, and other things * so we cannot really handle checksum offload with it - fix it up in * software before we handle anything else. -- cgit From 80616c0db8a26f030be78fea39f7c880c4263e55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 14 Apr 2015 14:50:41 +0200 Subject: mac80211: allow segmentation offloads Implement the necessary software segmentation on the normal TX path so that fast-xmit can use segmentation offload if the hardware (or driver) supports it. Signed-off-by: Johannes Berg --- net/mac80211/main.c | 3 ++- net/mac80211/tx.c | 70 +++++++++++++++++++++++++++++++++++------------------ 2 files changed, 48 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6dd6dd47a9df..ea31f119234b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -840,7 +840,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* Only HW csum features are currently compatible with mac80211 */ feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | - NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA; + NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | + NETIF_F_GSO_SOFTWARE; if (WARN_ON(hw->netdev_features & ~feature_whitelist)) return -EINVAL; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3a421a04cbec..df62942653ad 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2843,6 +2843,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; + struct sk_buff *next; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -2864,36 +2865,57 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; } - /* we cannot process non-linear frames on this path */ - if (skb_linearize(skb)) { - kfree_skb(skb); - goto out; - } + if (skb_is_gso(skb)) { + struct sk_buff *segs; - /* the frame could be fragmented, software-encrypted, and other things - * so we cannot really handle checksum offload with it - fix it up in - * software before we handle anything else. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (skb_checksum_help(skb)) + segs = skb_gso_segment(skb, 0); + if (IS_ERR(segs)) { goto out_free; + } else if (segs) { + consume_skb(skb); + skb = segs; + } + } else { + /* we cannot process non-linear frames on this path */ + if (skb_linearize(skb)) { + kfree_skb(skb); + goto out; + } + + /* the frame could be fragmented, software-encrypted, and other + * things so we cannot really handle checksum offload with it - + * fix it up in software before we handle anything else. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (skb_checksum_help(skb)) + goto out_free; + } } - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); - if (IS_ERR(skb)) - goto out; + next = skb; + while (next) { + skb = next; + next = skb->next; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - dev->trans_start = jiffies; + skb->prev = NULL; + skb->next = NULL; + + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); + if (IS_ERR(skb)) + goto out; - ieee80211_xmit(sdata, sta, skb); + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + dev->trans_start = jiffies; + + ieee80211_xmit(sdata, sta, skb); + } goto out; out_free: kfree_skb(skb); -- cgit From 1ee5e6676bccbf7a035d8d35c143f1a61e602198 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 22 Apr 2015 15:51:16 +0800 Subject: xfrm: remove the xfrm_queue_purge definition The task of xfrm_queue_purge is same as skb_queue_purge, so remove it Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 638af0655aaf..d8c35ad4210d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -315,14 +315,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) } EXPORT_SYMBOL(xfrm_policy_destroy); -static void xfrm_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); -} - /* Rule must be locked. Release descentant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ @@ -335,7 +327,7 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) if (del_timer(&policy->polq.hold_timer)) xfrm_pol_put(policy); - xfrm_queue_purge(&policy->polq.hold_queue); + skb_queue_purge(&policy->polq.hold_queue); if (del_timer(&policy->timer)) xfrm_pol_put(policy); @@ -1955,7 +1947,7 @@ out: purge_queue: pq->timeout = 0; - xfrm_queue_purge(&pq->hold_queue); + skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } -- cgit From 800777026eeb33585f29608c2c6131fc66d2e218 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 22 Apr 2015 17:09:54 +0800 Subject: xfrm: optimise the use of walk list header in xfrm_policy/state_walk The walk from input is the list header, and marked as dead, and will be skipped in loop. list_first_entry() can be used to return the true usable value from walk if walk is not empty Signed-off-by: Li RongQing Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 +++- net/xfrm/xfrm_state.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d8c35ad4210d..847053ec7d91 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1004,7 +1004,9 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else - x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); + x = list_first_entry(&walk->walk.all, + struct xfrm_policy_walk_entry, all); + list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f5e39e35d73a..df9318301e98 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1626,7 +1626,7 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, if (list_empty(&walk->all)) x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); else - x = list_entry(&walk->all, struct xfrm_state_walk, all); + x = list_first_entry(&walk->all, struct xfrm_state_walk, all); list_for_each_entry_from(x, &net->xfrm.state_all, all) { if (x->state == XFRM_STATE_DEAD) continue; -- cgit From f31e8d4f7b44092b6b2ab3a6b1d4079836b6955a Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 23 Apr 2015 11:06:53 +0800 Subject: xfrm: fix the return code when xfrm_*_register_afinfo failed If xfrm_*_register_afinfo failed since xfrm_*_afinfo[afinfo->family] had the value, return the -EEXIST, not -ENOBUFS Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- net/xfrm/xfrm_state.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 526c4feb3b50..459796a78ab0 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -29,7 +29,7 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 847053ec7d91..c4c47f337a3a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2808,7 +2808,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index df9318301e98..e47e4980b35c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1908,7 +1908,7 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_state_afinfo_lock); -- cgit From ebd82b39bf11b38b0b50919c8d4386706b26bff7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2015 17:26:06 +0200 Subject: mac80211: make station hash table max_size configurable Allow debug builds to configure the station hash table maximum size in order to run with hash collisions in limited scenarios such as hwsim testing. The default remains 0 which effectively means no limit. Signed-off-by: Johannes Berg --- net/mac80211/Kconfig | 10 ++++++++++ net/mac80211/sta_info.c | 1 + 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 64a012a0c6e5..5f698d4ec53b 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -305,3 +305,13 @@ config MAC80211_DEBUG_COUNTERS and show them in debugfs. If unsure, say N. + +config MAC80211_STA_HASH_MAX_SIZE + int "Station hash table maximum size" if MAC80211_DEBUG_MENU + default 0 + ---help--- + Setting this option to a low value (e.g. 4) allows testing the + hash table with collisions relatively deterministically (just + connect more stations than the number selected here.) + + If unsure, leave the default of 0. diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 737730abba6d..aec15d746aea 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -70,6 +70,7 @@ static const struct rhashtable_params sta_rht_params = { .key_offset = offsetof(struct sta_info, sta.addr), .key_len = ETH_ALEN, .hashfn = sta_addr_hash, + .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; /* Caller must hold local->sta_mtx */ -- cgit From df1404650ccbfeb76a84f301f22316be0d00a864 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 14:40:58 +0200 Subject: mac80211: remove support for IFF_PROMISC This support is essentially useless as typically networks are encrypted, frames will be filtered by hardware, and rate scaling will be done with the intended recipient in mind. For real monitoring of the network, the monitor mode support should be used instead. Removing it removes a lot of corner cases. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 6 ++---- net/mac80211/iface.c | 19 ++----------------- net/mac80211/main.c | 3 --- net/mac80211/rx.c | 33 ++++++++------------------------- 4 files changed, 12 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 556051f68ad7..7d12ba5a4a36 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -725,7 +725,6 @@ struct ieee80211_if_mesh { * enum ieee80211_sub_if_data_flags - virtual interface flags * * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets - * @IEEE80211_SDATA_PROMISC: interface is promisc * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between * associated stations and deliver multicast frames both @@ -735,7 +734,6 @@ struct ieee80211_if_mesh { */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), - IEEE80211_SDATA_PROMISC = BIT(1), IEEE80211_SDATA_OPERATING_GMODE = BIT(2), IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), @@ -1211,8 +1209,8 @@ struct ieee80211_local { atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; - /* number of interfaces with corresponding IFF_ flags */ - atomic_t iff_allmultis, iff_promiscs; + /* number of interfaces with allmulti RX */ + atomic_t iff_allmultis; struct rate_control_ref *rate_ctrl; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b4ac596a7cb7..7791a08a560a 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -697,9 +697,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_inc(&local->iff_allmultis); - if (sdata->flags & IEEE80211_SDATA_PROMISC) - atomic_inc(&local->iff_promiscs); - if (coming_up) local->open_count++; @@ -827,13 +824,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, WARN_ON_ONCE((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)); - /* don't count this interface for promisc/allmulti while it is down */ + /* don't count this interface for allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_dec(&local->iff_allmultis); - if (sdata->flags & IEEE80211_SDATA_PROMISC) - atomic_dec(&local->iff_promiscs); - if (sdata->vif.type == NL80211_IFTYPE_AP) { local->fif_pspoll--; local->fif_probe_req--; @@ -1047,12 +1041,10 @@ static void ieee80211_set_multicast_list(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - int allmulti, promisc, sdata_allmulti, sdata_promisc; + int allmulti, sdata_allmulti; allmulti = !!(dev->flags & IFF_ALLMULTI); - promisc = !!(dev->flags & IFF_PROMISC); sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI); - sdata_promisc = !!(sdata->flags & IEEE80211_SDATA_PROMISC); if (allmulti != sdata_allmulti) { if (dev->flags & IFF_ALLMULTI) @@ -1062,13 +1054,6 @@ static void ieee80211_set_multicast_list(struct net_device *dev) sdata->flags ^= IEEE80211_SDATA_ALLMULTI; } - if (promisc != sdata_promisc) { - if (dev->flags & IFF_PROMISC) - atomic_inc(&local->iff_promiscs); - else - atomic_dec(&local->iff_promiscs); - sdata->flags ^= IEEE80211_SDATA_PROMISC; - } spin_lock_bh(&local->filter_lock); __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len); spin_unlock_bh(&local->filter_lock); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ea31f119234b..b144de971366 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -41,9 +41,6 @@ void ieee80211_configure_filter(struct ieee80211_local *local) unsigned int changed_flags; unsigned int new_flags = 0; - if (atomic_read(&local->iff_promiscs)) - new_flags |= FIF_PROMISC_IN_BSS; - if (atomic_read(&local->iff_allmultis)) new_flags |= FIF_ALLMULTI; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6e3b564b6dea..99fb5d80e60a 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2331,11 +2331,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames); ieee80211_add_pending_skb(local, fwd_skb); out: - if (is_multicast_ether_addr(hdr->addr1) || - sdata->dev->flags & IFF_PROMISC) + if (is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; - else - return RX_DROP_MONITOR; + return RX_DROP_MONITOR; } #endif @@ -3266,12 +3264,8 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, if (!bssid && !sdata->u.mgd.use_4addr) return false; if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC) || - sdata->u.mgd.use_4addr) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) + return false; break; case NL80211_IFTYPE_ADHOC: if (!bssid) @@ -3285,9 +3279,7 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, return false; } else if (!multicast && !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + return false; } else if (!rx->sta) { int rate_idx; if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) @@ -3309,12 +3301,7 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, } else if (!multicast && !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1)) { - /* if we are in promisc mode we also accept - * packets not destined for us - */ - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - rx->flags &= ~IEEE80211_RX_RA_MATCH; + return false; } else if (!rx->sta) { int rate_idx; if (status->flag & RX_FLAG_HT) @@ -3327,12 +3314,8 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, break; case NL80211_IFTYPE_MESH_POINT: if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) + return false; break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: -- cgit From 5c90067c0f79705b8fa3207c734b18727f320637 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 14:48:34 +0200 Subject: mac80211: remove IEEE80211_RX_RA_MATCH With promisc support gone, only AP and P2P-Device type interfaces still clear IEEE80211_RX_RA_MATCH. In both cases this isn't really necessary though, so we can remove that flag and the code. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 --- net/mac80211/rx.c | 44 +++++--------------------------------------- 2 files changed, 5 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 7d12ba5a4a36..e9f36f7ec733 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -181,8 +181,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; /** * enum ieee80211_packet_rx_flags - packet RX flags - * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed - * (incl. multicast frames) * @IEEE80211_RX_FRAGMENTED: fragmented frame * @IEEE80211_RX_AMSDU: a-MSDU packet * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed @@ -192,7 +190,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; * @rx_flags field of &struct ieee80211_rx_status. */ enum ieee80211_packet_rx_flags { - IEEE80211_RX_RA_MATCH = BIT(1), IEEE80211_RX_FRAGMENTED = BIT(2), IEEE80211_RX_AMSDU = BIT(3), IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 99fb5d80e60a..2d8c83892214 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -981,7 +981,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, struct sk_buff *skb = rx->skb; struct ieee80211_local *local = rx->local; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct sta_info *sta = rx->sta; struct tid_ampdu_rx *tid_agg_rx; u16 sc; @@ -1016,10 +1015,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL) goto dont_reorder; - /* not actually part of this BA session */ - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - goto dont_reorder; - /* new, potentially un-ordered, ampdu frame - process it */ /* reset session timer */ @@ -1073,10 +1068,8 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) if (unlikely(ieee80211_has_retry(hdr->frame_control) && rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) { - if (status->rx_flags & IEEE80211_RX_RA_MATCH) { - rx->local->dot11FrameDuplicateCount++; - rx->sta->num_duplicates++; - } + rx->local->dot11FrameDuplicateCount++; + rx->sta->num_duplicates++; return RX_DROP_UNUSABLE; } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) { rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl; @@ -1267,7 +1260,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); int tid, ac; - if (!rx->sta || !(status->rx_flags & IEEE80211_RX_RA_MATCH)) + if (!rx->sta) return RX_CONTINUE; if (sdata->vif.type != NL80211_IFTYPE_AP && @@ -1388,9 +1381,6 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) } } - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_CONTINUE; - if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_rx_notify(rx->sdata, hdr); @@ -1519,13 +1509,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * possible. */ - /* - * No point in finding a key and decrypting if the frame is neither - * addressed to us nor a multicast frame. - */ - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_CONTINUE; - /* start without a key */ rx->key = NULL; fc = hdr->frame_control; @@ -2056,7 +2039,6 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) struct sk_buff *skb, *xmit_skb; struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; struct sta_info *dsta; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); dev->stats.rx_packets++; dev->stats.rx_bytes += rx->skb->len; @@ -2067,7 +2049,6 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) if ((sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && - (status->rx_flags & IEEE80211_RX_RA_MATCH) && (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { if (is_multicast_ether_addr(ehdr->h_dest)) { /* @@ -2208,7 +2189,6 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) struct sk_buff *skb = rx->skb, *fwd_skb; struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u16 q, hdrlen; @@ -2239,8 +2219,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr)) return RX_DROP_MONITOR; - if (!ieee80211_is_data(hdr->frame_control) || - !(status->rx_flags & IEEE80211_RX_RA_MATCH)) + if (!ieee80211_is_data(hdr->frame_control)) return RX_CONTINUE; if (!mesh_hdr->ttl) @@ -2560,9 +2539,6 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) rx->flags |= IEEE80211_RX_BEACON_REPORTED; } - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_DROP_MONITOR; - if (ieee80211_drop_unencrypted_mgmt(rx)) return RX_DROP_UNUSABLE; @@ -2590,9 +2566,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT) return RX_DROP_UNUSABLE; - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_DROP_UNUSABLE; - switch (mgmt->u.action.category) { case WLAN_CATEGORY_HT: /* reject HT action frames from stations not supporting HT */ @@ -3334,9 +3307,7 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, return false; if (ieee80211_is_public_action(hdr, skb->len)) return true; - if (!ieee80211_is_beacon(hdr->frame_control)) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + return ieee80211_is_beacon(hdr->frame_control); } else if (!ieee80211_has_tods(hdr->frame_control)) { /* ignore data frames to TDLS-peers */ if (ieee80211_is_data(hdr->frame_control)) @@ -3359,9 +3330,6 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, !ieee80211_is_probe_resp(hdr->frame_control) && !ieee80211_is_beacon(hdr->frame_control)) return false; - if (!ether_addr_equal(sdata->vif.addr, hdr->addr1) && - !multicast) - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; break; default: /* should never get here */ @@ -3383,11 +3351,9 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; rx->skb = skb; - status->rx_flags |= IEEE80211_RX_RA_MATCH; if (!prepare_for_handlers(rx, hdr)) return false; -- cgit From a58fbe1a8540d870393b343bc6cb06a7f8201387 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 15:08:39 +0200 Subject: mac80211: clean up/rename prepare_for_handlers() The function really shouldn't be called prepare_for_handlers(), all it does is check if the frame should be dropped. Rename it to ieee80211_accept_frame() and clean it up a bit. Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 80 +++++++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2d8c83892214..5fea34b158fe 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3223,11 +3223,11 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* main receive path */ -static bool prepare_for_handlers(struct ieee80211_rx_data *rx, - struct ieee80211_hdr *hdr) +static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) { struct ieee80211_sub_if_data *sdata = rx->sdata; struct sk_buff *skb = rx->skb; + struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); int multicast = is_multicast_ether_addr(hdr->addr1); @@ -3236,24 +3236,23 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, case NL80211_IFTYPE_STATION: if (!bssid && !sdata->u.mgd.use_4addr) return false; - if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) - return false; - break; + if (multicast) + return true; + return ether_addr_equal(sdata->vif.addr, hdr->addr1); case NL80211_IFTYPE_ADHOC: if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) return false; - if (ieee80211_is_beacon(hdr->frame_control)) { + if (ieee80211_is_beacon(hdr->frame_control)) return true; - } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { + if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) return false; - } else if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { + if (!multicast && + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) return false; - } else if (!rx->sta) { + if (!rx->sta) { int rate_idx; if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) rate_idx = 0; /* TODO: HT/VHT rates */ @@ -3262,20 +3261,18 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, ieee80211_ibss_rx_no_sta(sdata, bssid, hdr->addr2, BIT(rate_idx)); } - break; + return true; case NL80211_IFTYPE_OCB: if (!bssid) return false; - if (ieee80211_is_beacon(hdr->frame_control)) { + if (ieee80211_is_beacon(hdr->frame_control)) return false; - } else if (!is_broadcast_ether_addr(bssid)) { - ocb_dbg(sdata, "BSSID mismatch in OCB mode!\n"); + if (!is_broadcast_ether_addr(bssid)) return false; - } else if (!multicast && - !ether_addr_equal(sdata->dev->dev_addr, - hdr->addr1)) { + if (!multicast && + !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1)) return false; - } else if (!rx->sta) { + if (!rx->sta) { int rate_idx; if (status->flag & RX_FLAG_HT) rate_idx = 0; /* TODO: HT rates */ @@ -3284,18 +3281,17 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, ieee80211_ocb_rx_no_sta(sdata, bssid, hdr->addr2, BIT(rate_idx)); } - break; + return true; case NL80211_IFTYPE_MESH_POINT: - if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) - return false; - break; + if (multicast) + return true; + return ether_addr_equal(sdata->vif.addr, hdr->addr1); case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: - if (!bssid) { - if (!ether_addr_equal(sdata->vif.addr, hdr->addr1)) - return false; - } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { + if (!bssid) + return ether_addr_equal(sdata->vif.addr, hdr->addr1); + + if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { /* * Accept public action frames even when the * BSSID doesn't match, this is used for P2P @@ -3308,7 +3304,9 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, if (ieee80211_is_public_action(hdr, skb->len)) return true; return ieee80211_is_beacon(hdr->frame_control); - } else if (!ieee80211_has_tods(hdr->frame_control)) { + } + + if (!ieee80211_has_tods(hdr->frame_control)) { /* ignore data frames to TDLS-peers */ if (ieee80211_is_data(hdr->frame_control)) return false; @@ -3317,27 +3315,22 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, !ether_addr_equal(bssid, hdr->addr1)) return false; } - break; + return true; case NL80211_IFTYPE_WDS: if (bssid || !ieee80211_is_data(hdr->frame_control)) return false; - if (!ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2)) - return false; - break; + return ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2); case NL80211_IFTYPE_P2P_DEVICE: - if (!ieee80211_is_public_action(hdr, skb->len) && - !ieee80211_is_probe_req(hdr->frame_control) && - !ieee80211_is_probe_resp(hdr->frame_control) && - !ieee80211_is_beacon(hdr->frame_control)) - return false; - break; + return ieee80211_is_public_action(hdr, skb->len) || + ieee80211_is_probe_req(hdr->frame_control) || + ieee80211_is_probe_resp(hdr->frame_control) || + ieee80211_is_beacon(hdr->frame_control); default: - /* should never get here */ - WARN_ON_ONCE(1); break; } - return true; + WARN_ON_ONCE(1); + return false; } /* @@ -3351,11 +3344,10 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_hdr *hdr = (void *)skb->data; rx->skb = skb; - if (!prepare_for_handlers(rx, hdr)) + if (!ieee80211_accept_frame(rx)) return false; if (!consume) { -- cgit From 6fe3eac79329611eaa716b7a3e3b85f6698b668b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 15:10:45 +0200 Subject: mac80211: OCB: remove pointless check for broadcast BSSID The OCB input path already checked that the BSSID is the broadcast address, so the later check can never fail. Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5fea34b158fe..11dbbc576491 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1362,11 +1362,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) } } } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) { - u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, - NL80211_IFTYPE_OCB); - /* OCB uses wild-card BSSID */ - if (is_broadcast_ether_addr(bssid)) - sta->last_rx = jiffies; + sta->last_rx = jiffies; } else if (!is_multicast_ether_addr(hdr->addr1)) { /* * Mesh beacons will update last_rx when if they are found to -- cgit From ce5b071a456beea13a893fcc73f47998bf7ceb35 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 16:44:46 +0200 Subject: mac80211: don't update dev->trans_start This isn't necessary any more as the stack will automatically update the TXQ's trans_start after calling ndo_start_xmit(). Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index df62942653ad..83e3261dbf67 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2729,7 +2729,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len + extra_head; - dev->trans_start = jiffies; /* will not be crypto-handled beyond what we do here, so use false * as the may-encrypt argument for the resize to not account for @@ -2912,7 +2911,6 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; - dev->trans_start = jiffies; ieee80211_xmit(sdata, sta, skb); } -- cgit From 5a490510ba5fce8a10746525357a297f8f076bb1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 17:10:38 +0200 Subject: mac80211: use per-CPU TX/RX statistics This isn't all that relevant for RX right now, but TX can be concurrent due to multi-queue and the accounting is therefore broken. Use the standard per-CPU statistics to avoid this. Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- net/mac80211/rx.c | 21 ++++++++++++++------- net/mac80211/tx.c | 16 ++++++++++++---- 3 files changed, 70 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7791a08a560a..dc2d7133c4f6 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1094,6 +1094,35 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev, return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } +static struct rtnl_link_stats64 * +ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *tstats; + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + tstats = per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + stats->rx_packets += rx_packets; + stats->tx_packets += tx_packets; + stats->rx_bytes += rx_bytes; + stats->tx_bytes += tx_bytes; + } + + return stats; +} + static const struct net_device_ops ieee80211_dataif_ops = { .ndo_open = ieee80211_open, .ndo_stop = ieee80211_stop, @@ -1103,6 +1132,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { .ndo_change_mtu = ieee80211_change_mtu, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_netdev_select_queue, + .ndo_get_stats64 = ieee80211_get_stats64, }; static u16 ieee80211_monitor_select_queue(struct net_device *dev, @@ -1136,14 +1166,21 @@ static const struct net_device_ops ieee80211_monitorif_ops = { .ndo_change_mtu = ieee80211_change_mtu, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_monitor_select_queue, + .ndo_get_stats64 = ieee80211_get_stats64, }; +static void ieee80211_if_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ieee80211_if_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->netdev_ops = &ieee80211_dataif_ops; - dev->destructor = free_netdev; + dev->destructor = ieee80211_if_free; } static void ieee80211_iface_work(struct work_struct *work) @@ -1684,6 +1721,12 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); + ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!ndev->tstats) { + free_netdev(ndev); + return -ENOMEM; + } + ndev->needed_headroom = local->tx_headroom + 4*6 /* four MAC addresses */ + 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */ diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 11dbbc576491..b69a071d7679 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -32,6 +32,16 @@ #include "wme.h" #include "rate.h" +static inline void ieee80211_rx_stats(struct net_device *dev, u32 len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += len; + u64_stats_update_end(&tstats->syncp); +} + /* * monitor mode reception * @@ -529,8 +539,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, } prev_dev = sdata->dev; - sdata->dev->stats.rx_packets++; - sdata->dev->stats.rx_bytes += skb->len; + ieee80211_rx_stats(sdata->dev, skb->len); } if (prev_dev) { @@ -2036,12 +2045,11 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; struct sta_info *dsta; - dev->stats.rx_packets++; - dev->stats.rx_bytes += rx->skb->len; - skb = rx->skb; xmit_skb = NULL; + ieee80211_rx_stats(dev, skb->len); + if ((sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && @@ -3045,8 +3053,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, } prev_dev = sdata->dev; - sdata->dev->stats.rx_packets++; - sdata->dev->stats.rx_bytes += skb->len; + ieee80211_rx_stats(sdata->dev, skb->len); } if (prev_dev) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 83e3261dbf67..745fdf5c2722 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -37,6 +37,16 @@ /* misc utils */ +static inline void ieee80211_tx_stats(struct net_device *dev, u32 len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_packets++; + tstats->tx_bytes += len; + u64_stats_update_end(&tstats->syncp); +} + static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct sk_buff *skb, int group_addr, int next_frag_len) @@ -2727,8 +2737,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return true; } - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len + extra_head; + ieee80211_tx_stats(dev, skb->len + extra_head); /* will not be crypto-handled beyond what we do here, so use false * as the may-encrypt argument for the resize to not account for @@ -2909,8 +2918,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (IS_ERR(skb)) goto out; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + ieee80211_tx_stats(dev, skb->len); ieee80211_xmit(sdata, sta, skb); } -- cgit From dc0565ce6e34be06730312e79b226b7408a543c8 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 24 Apr 2015 16:49:31 +0800 Subject: xfrm: slightly optimise xfrm_input Check x->km.state with XFRM_STATE_ACQ only when state is not XFRM_STAT_VALID, not everytime Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 459796a78ab0..1858a45f008b 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -239,13 +239,13 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp->xvec[skb->sp->len++] = x; spin_lock(&x->lock); - if (unlikely(x->km.state == XFRM_STATE_ACQ)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); - goto drop_unlock; - } if (unlikely(x->km.state != XFRM_STATE_VALID)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); + if (x->km.state == XFRM_STATE_ACQ) + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + else + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; } -- cgit From 4292504044a4fd4c5d9155dcb5c7b09ed6cbf611 Mon Sep 17 00:00:00 2001 From: Chun-Yeow Yeoh Date: Sat, 18 Apr 2015 01:30:02 +0800 Subject: cfg80211: allow the plink state blocking for user managed mesh wpa_supplicant or authsae handles the mesh peering in user space, but the plink state is still managed in kernel space. Currently, there is no implementation by wpa_supplicant or authsae to block the plink state after it is set to ESTAB. By applying this patch, we can use the "iw mesh0 station set plink_action block" to block the peer mesh STA. This is useful for experimenting purposes. Signed-off-by: Chun-Yeow Yeoh Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index dd78445c7d50..8a33bbae9ec5 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4061,7 +4061,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; break; case CFG80211_STA_MESH_PEER_USER: - if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION) + if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION && + params->plink_action != NL80211_PLINK_ACTION_BLOCK) return -EINVAL; break; } -- cgit From c1041f109a788fbc45197c38aed4c46e580d9d5f Mon Sep 17 00:00:00 2001 From: Chaya Rachel Ivgi Date: Mon, 20 Apr 2015 22:51:46 +0300 Subject: mac80211: fix ignored HT/VHT override configs HT and VHT override configurations were ignored during association and applied only when first beacon recived, or not applied at all. Fix the code to apply HT/VHT overrides during association. This is a bit tricky since the channel was already configured during authentication and we don't want to reconfigure it unless there's really a change. Signed-off-by: Chaya Rachel Ivgi Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 57 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 26053bf2faa8..3294666f599c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4307,15 +4307,15 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, - struct cfg80211_bss *cbss, bool assoc) + struct cfg80211_bss *cbss, bool assoc, + bool override) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_supported_band *sband; - struct ieee80211_sta_ht_cap sta_ht_cap; - bool have_sta = false, is_override = false; + bool have_sta = false; int err; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4335,14 +4335,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return -ENOMEM; } - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); - - is_override = (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) != - (sband->ht_cap.cap & - IEEE80211_HT_CAP_SUP_WIDTH_20_40); - - if (new_sta || is_override) { + if (new_sta || override) { err = ieee80211_prep_channel(sdata, cbss); if (err) { if (new_sta) @@ -4552,7 +4545,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); - err = ieee80211_prep_connection(sdata, req->bss, false); + err = ieee80211_prep_connection(sdata, req->bss, false, false); if (err) goto err_clear; @@ -4624,6 +4617,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; const u8 *ssidie, *ht_ie, *vht_ie; int i, err; + bool override = false; assoc_data = kzalloc(sizeof(*assoc_data) + req->ie_len, GFP_KERNEL); if (!assoc_data) @@ -4728,14 +4722,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, } } - if (req->flags & ASSOC_REQ_DISABLE_HT) { - ifmgd->flags |= IEEE80211_STA_DISABLE_HT; - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - } - - if (req->flags & ASSOC_REQ_DISABLE_VHT) - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - /* Also disable HT if we don't support it or the AP doesn't use WMM */ sband = local->hw.wiphy->bands[req->bss->channel->band]; if (!sband->ht_cap.ht_supported || @@ -4847,7 +4833,36 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->dtim_period = 0; ifmgd->have_beacon = false; - err = ieee80211_prep_connection(sdata, req->bss, true); + /* override HT/VHT configuration only if the AP and we support it */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + struct ieee80211_sta_ht_cap sta_ht_cap; + + if (req->flags & ASSOC_REQ_DISABLE_HT) + override = true; + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + + /* check for 40 MHz disable override */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_40MHZ) && + sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && + !(sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) + override = true; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + req->flags & ASSOC_REQ_DISABLE_VHT) + override = true; + } + + if (req->flags & ASSOC_REQ_DISABLE_HT) { + ifmgd->flags |= IEEE80211_STA_DISABLE_HT; + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + } + + if (req->flags & ASSOC_REQ_DISABLE_VHT) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + + err = ieee80211_prep_connection(sdata, req->bss, true, override); if (err) goto err_clear; -- cgit From 6382246e895fa0ae5162de7c1e5566b9719bdd26 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 20 Apr 2015 22:53:37 +0300 Subject: mac80211: notify the driver upon BAR Rx When we receive a BAR, this typically means that our peer doesn't hear our Block-Acks or that we can't hear its frames. Either way, it is a good indication that the link is in a bad condition. This is why it can serve as a probe to the driver. Use the event_callback callback for this. Since more events with the same data will be added in the feature, the structure that describes the data attached to the event is called in a generic name: ieee80211_ba_event. This also means that from now on, the event_callback can't sleep. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index b69a071d7679..ac6bfa999416 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2427,6 +2427,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) struct { __le16 control, start_seq_num; } __packed bar_data; + struct ieee80211_event event = { + .type = BAR_RX_EVENT, + }; if (!rx->sta) return RX_DROP_MONITOR; @@ -2442,6 +2445,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_DROP_MONITOR; start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; + event.u.ba.tid = tid; + event.u.ba.ssn = start_seq_num; + event.u.ba.sta = &rx->sta->sta; /* reset session timer */ if (tid_agg_rx->timeout) @@ -2454,6 +2460,8 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) start_seq_num, frames); spin_unlock(&tid_agg_rx->reorder_lock); + drv_event_callback(rx->local, rx->sdata, &event); + kfree_skb(skb); return RX_QUEUED; } -- cgit From b497de63ad5dcdae999c14444c4e7f53fd60119c Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 20 Apr 2015 22:53:38 +0300 Subject: mac80211: notify the driver on reordering buffer timeout When frames time out in the reordering buffer, it is a good indication that something went wrong and the driver may want to know about that to take action or trigger debug flows. It is pointless to notify the driver about each frame that is released. Notify each time the timer fires. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ac6bfa999416..e08253547f42 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3229,6 +3229,15 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames); spin_unlock(&tid_agg_rx->reorder_lock); + if (!skb_queue_empty(&frames)) { + struct ieee80211_event event = { + .type = BA_FRAME_TIMEOUT, + .u.ba.tid = tid, + .u.ba.sta = &sta->sta, + }; + drv_event_callback(rx.local, rx.sdata, &event); + } + ieee80211_rx_handlers(&rx, &frames); } -- cgit From b8e69d51ffb5753cbb62f3ebfeadeffd3a19b0e9 Mon Sep 17 00:00:00 2001 From: Matti Gottlieb Date: Mon, 20 Apr 2015 22:54:14 +0300 Subject: mac80211: force off channel transmission for public action frames Currently while associated to an AP and sending a (public) action frame to a different AP on the same channel, the action frame will be sent like a regular tx frame without going off channel. When power save is enabled this can cause problems, since the device can go into power save and miss the response to the action frame that is sent by the other AP. Force off-channel transmission to avoid this issue in case - HW offchannel is used, - the user didn't forbid transmitting frames off channel - the frame is not sent to the AP that we are associated with (if it is we assume the response would be bufferable) Signed-off-by: Matti Gottlieb Signed-off-by: Emmanuel Grumbach [reword commit message a bit] Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4aa5e893cbaa..72a0178af737 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3343,8 +3343,14 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (!sdata->u.mgd.associated) + sdata_lock(sdata); + if (!sdata->u.mgd.associated || + (params->offchan && params->wait && + local->ops->remain_on_channel && + memcmp(sdata->u.mgd.associated->bssid, + mgmt->bssid, ETH_ALEN))) need_offchan = true; + sdata_unlock(sdata); break; case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; -- cgit From f828ad0ce22474820255c6faeb88b05724a8c1eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Apr 2015 16:20:58 -0700 Subject: tcp_westwood: fix tcp_westwood_info() I forgot to update tcp_westwood when changing get_info() behavior, this patch should fix this. Fixes: 64f40ff5bbdb ("tcp: prepare CC get_info() access from getsockopt()") Reported-by: kbuild test robot Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b3c57cceb990..c10732e39837 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -256,18 +256,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) } /* Extract info for Tcp socket info provided via netlink. */ -static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rtt = jiffies_to_usecs(ca->rtt), - .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = 0; + info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt), + info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } -- cgit From 355b590ca24a53f5e2f6999218b5321eaf730a7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 May 2015 10:37:49 -0700 Subject: ipv4: speedup ip_idents_reserve() Under stress, ip_idents_reserve() is accessing a contended cache line twice, with non optimal MESI transactions. If we place timestamps in separate location, we reduce this pressure by ~50% and allow atomic_add_return() to issue a Request for Ownership. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a78540f28276..81f513d6f27f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -457,12 +457,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, } #define IP_IDENTS_SZ 2048u -struct ip_ident_bucket { - atomic_t id; - u32 stamp32; -}; -static struct ip_ident_bucket *ip_idents __read_mostly; +static atomic_t *ip_idents __read_mostly; +static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker @@ -470,15 +467,16 @@ static struct ip_ident_bucket *ip_idents __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(bucket->stamp32); + u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; + atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 delta = 0; - if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, &bucket->id) - segs; + return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); @@ -2741,6 +2739,10 @@ int __init ip_rt_init(void) prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); + if (!ip_tstamps) + panic("IP: failed to allocate ip_tstamps\n"); + for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); -- cgit From 1f56a01f4ed1bbb36ff2a97f75a6e6231d790cff Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 28 Apr 2015 13:03:03 -0700 Subject: ipv6: Consider RTF_CACHE when searching the fib6 tree It is a prep work for the later bug-fix patch which will stop /128 route from disappearing after pmtu update. The later bug-fix patch will allow a /128 route and its RTF_CACHE clone both exist at the same fib6_node. To do this, we need to prepare the existing fib6 tree search to expect RTF_CACHE for /128 route. Note that the fn->leaf is sorted by rt6i_metric. Hence, RTF_CACHE (if there is any) is always at the front. This property leads to the following: 1. When doing ip6_route_del(), it should honor the RTF_CACHE flag which the caller is used to ask for deleting clone or non-clone. The rtm_to_fib6_config() should also check the RTM_F_CLONED and then set RTF_CACHE accordingly so that: - 'ip -6 r del...' will make ip6_route_del() to delete a route and all its clones. Note that its clones is flushed by fib6_del() - 'ip -6 r flush table cache' will make ip6_route_del() to only delete clone(s). 2. Exclude RTF_CACHE from addrconf_get_prefix_route() which should not configure on a cloned route. 3. No change is need for rt6_device_match() since it currently could return a RTF_CACHE clone route, so the later bug-fix patch will not affect it. Signed-off-by: Martin KaFai Lau Reviewed-by: Hannes Frederic Sowa Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 ++ net/ipv6/route.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 37b70e82bff8..21c2c818df3b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2121,6 +2121,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); if (!fn) goto out; + + noflags |= RTF_CACHE; for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->dst.dev->ifindex != dev->ifindex) continue; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5c48293ff062..4774f13cbf90 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1785,6 +1785,9 @@ static int ip6_route_del(struct fib6_config *cfg) if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if ((rt->rt6i_flags & RTF_CACHE) && + !(cfg->fc_flags & RTF_CACHE)) + continue; if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -2433,6 +2436,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; + if (rtm->rtm_flags & RTM_F_CLONED) + cfg->fc_flags |= RTF_CACHE; + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); -- cgit From 9fbdcfaf97bf4b7d4fbd5b6a61f72167c73f37d9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 28 Apr 2015 13:03:04 -0700 Subject: ipv6: Extend the route lookups to low priority metrics. We search only for routes with highest priority metric in find_rr_leaf(). However if one of these routes is marked as invalid, we may fail to find a route even if there is a appropriate route with lower priority. Then we loose connectivity until the garbage collector deletes the invalid route. This typically happens if a host route expires afer a pmtu event. Fix this by searching also for routes with a lower priority metric. Signed-off-by: Steffen Klassert Signed-off-by: Martin KaFai Lau Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4774f13cbf90..07562a2702c9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -652,15 +652,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match; + struct rt6_info *rt, *match, *cont; int mpri = -1; match = NULL; - for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + cont = NULL; + for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + + match = find_match(rt, oif, strict, &mpri, match, do_rr); + } + + for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + match = find_match(rt, oif, strict, &mpri, match, do_rr); - for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + } + + if (match || !cont) + return match; + + for (rt = cont; rt; rt = rt->dst.rt6_next) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; -- cgit From 653437d02f1f12d528c290e64f6dc54be1224db2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 28 Apr 2015 13:03:05 -0700 Subject: ipv6: Stop /128 route from disappearing after pmtu update This patch is mostly from Steffen Klassert . I only removed the (rt6->rt6i_dst.plen == 128) check from ip6_rt_update_pmtu() because the (rt6->rt6i_flags & RTF_CACHE) test has already implied it. This patch: 1. Create RTF_CACHE route for /128 non local route 2. After (1), all routes that allow pmtu update should have a RTF_CACHE clone. Hence, stop updating MTU for any non RTF_CACHE route. Signed-off-by: Martin KaFai Lau Signed-off-by: Steffen Klassert Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 07562a2702c9..aa4cfdd13f28 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -977,7 +977,7 @@ redo_rt6_select: if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); - else if (!(rt->dst.flags & DST_HOST)) + else if (!(rt->dst.flags & DST_HOST) || !(rt->dst.flags & RTF_LOCAL)) nrt = rt6_alloc_clone(rt, &fl6->daddr); else goto out2; @@ -1172,7 +1172,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct rt6_info *rt6 = (struct rt6_info *)dst; dst_confirm(dst); - if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { + if (mtu < dst_mtu(dst) && (rt6->rt6i_flags & RTF_CACHE)) { struct net *net = dev_net(dst->dev); rt6->rt6i_flags |= RTF_MODIFIED; -- cgit From 4b32b5ad31a68a661f761c76dfd0d076636d3ae9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 28 Apr 2015 13:03:06 -0700 Subject: ipv6: Stop rt6_info from using inet_peer's metrics inet_peer is indexed by the dst address alone. However, the fib6 tree could have multiple routing entries (rt6_info) for the same dst. For example, 1. A /128 dst via multiple gateways. 2. A RTF_CACHE route cloned from a /128 route. In the above cases, all of them will share the same metrics and step on each other. This patch will steer away from inet_peer's metrics and use dst_cow_metrics_generic() for everything. Change Highlights: 1. Remove rt6_cow_metrics() which currently acquires metrics from inet_peer for DST_HOST route (i.e. /128 route). 2. Add rt6i_pmtu to take care of the pmtu update to avoid creating a full size metrics just to override the RTAX_MTU. 3. After (2), the RTF_CACHE route can also share the metrics with its dst.from route, by: dst_init_metrics(&cache_rt->dst, dst_metrics_ptr(cache_rt->dst.from), true); 4. Stop creating RTF_CACHE route by cloning another RTF_CACHE route. Instead, directly clone from rt->dst. [ Currently, cloning from another RTF_CACHE is only possible during rt6_do_redirect(). Also, the old clone is removed from the tree immediately after the new clone is added. ] In case of cloning from an older redirect RTF_CACHE, it should work as before. In case of cloning from an older pmtu RTF_CACHE, this patch will forget the pmtu and re-learn it (if there is any) from the redirected route. The _rt6i_peer and DST_METRICS_FORCE_OVERWRITE will be removed in the next cleanup patch. Signed-off-by: Martin KaFai Lau Reviewed-by: Hannes Frederic Sowa Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/route.c | 102 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aa4cfdd13f28..4d6eb5d90cb4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -92,6 +92,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO @@ -136,33 +137,12 @@ static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { - struct rt6_info *rt = (struct rt6_info *) dst; - struct inet_peer *peer; - u32 *p = NULL; + struct rt6_info *rt = (struct rt6_info *)dst; - if (!(rt->dst.flags & DST_HOST)) + if (rt->rt6i_flags & RTF_CACHE) + return NULL; + else return dst_cow_metrics_generic(dst, old); - - peer = rt6_get_peer_create(rt); - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; - - p = peer->metrics; - if (inet_metrics_new(peer) || - (old & DST_METRICS_FORCE_OVERWRITE)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); - - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); - - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; - } - } - return p; } static inline const void *choose_neigh_daddr(struct rt6_info *rt, @@ -323,8 +303,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) struct inet6_dev *idev = rt->rt6i_idev; struct dst_entry *from = dst->from; - if (!(rt->dst.flags & DST_HOST)) - dst_destroy_metrics_generic(dst); + dst_destroy_metrics_generic(dst); if (idev) { rt->rt6i_idev = NULL; @@ -333,11 +312,6 @@ static void ip6_dst_destroy(struct dst_entry *dst) dst->from = NULL; dst_release(from); - - if (rt6_has_peer(rt)) { - struct inet_peer *peer = rt6_peer_ptr(rt); - inet_putpeer(peer); - } } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -1003,6 +977,7 @@ redo_rt6_select: goto redo_fib6_lookup_lock; out2: + rt6_dst_from_metrics_check(rt); rt->dst.lastuse = jiffies; rt->dst.__use++; @@ -1111,6 +1086,13 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ +static void rt6_dst_from_metrics_check(struct rt6_info *rt) +{ + if (rt->dst.from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); +} + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; @@ -1127,6 +1109,8 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) if (rt6_check_expired(rt)) return NULL; + rt6_dst_from_metrics_check(rt); + return dst; } @@ -1179,7 +1163,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - dst_metric_set(dst, RTAX_MTU, mtu); + rt6->rt6i_pmtu = mtu; rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); } } @@ -1359,9 +1343,14 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + goto out; + + mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; @@ -1947,12 +1936,27 @@ out: * Misc support functions */ +static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) +{ + BUG_ON(from->dst.from); + + rt->rt6i_flags &= ~RTF_EXPIRES; + dst_hold(&from->dst); + rt->dst.from = &from->dst; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} + static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, const struct in6_addr *dest) { struct net *net = dev_net(ort->dst.dev); - struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0, - ort->rt6i_table); + struct rt6_info *rt; + + if (ort->rt6i_flags & RTF_CACHE) + ort = (struct rt6_info *)ort->dst.from; + + rt = ip6_dst_alloc(net, ort->dst.dev, 0, + ort->rt6i_table); if (rt) { rt->dst.input = ort->dst.input; @@ -1961,7 +1965,6 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, rt->rt6i_dst.addr = *dest; rt->rt6i_dst.plen = 128; - dst_copy_metrics(&rt->dst, &ort->dst); rt->dst.error = ort->dst.error; rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) @@ -2393,11 +2396,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) PMTU discouvery. */ if (rt->dst.dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU) && - (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6))) { - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + !dst_metric_locked(&rt->dst, RTAX_MTU)) { + if (rt->rt6i_flags & RTF_CACHE) { + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) + rt->rt6i_pmtu = arg->mtu; + } else if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + } } return 0; } @@ -2627,6 +2639,7 @@ static int rt6_fill_node(struct net *net, int iif, int type, u32 portid, u32 seq, int prefix, int nowait, unsigned int flags) { + u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; long expires; @@ -2740,7 +2753,10 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt6i_pmtu) + metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (rt->rt6i_flags & RTF_GATEWAY) { -- cgit From afc4eef80c92b199357db3570d3c9c7631d699ff Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 28 Apr 2015 13:03:07 -0700 Subject: ipv6: Remove DST_METRICS_FORCE_OVERWRITE and _rt6i_peer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _rt6i_peer is no longer needed after the last patch, 'ipv6: Stop rt6_info from using inet_peer's metrics'. DST_METRICS_FORCE_OVERWRITE is added by commit e5fd387ad5b3 ("ipv6: do not overwrite inetpeer metrics prematurely"). Since inetpeer is no longer used for metrics, this bit is also not needed. Signed-off-by: Martin KaFai Lau Reviewed-by: Hannes Frederic Sowa Cc: Michal Kubeček Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/route.c | 36 +----------------------------------- net/ipv6/xfrm6_policy.c | 14 -------------- 2 files changed, 1 insertion(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d6eb5d90cb4..352271154ddb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -105,36 +105,6 @@ static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, int ifindex); #endif -static void rt6_bind_peer(struct rt6_info *rt, int create) -{ - struct inet_peer_base *base; - struct inet_peer *peer; - - base = inetpeer_base_ptr(rt->_rt6i_peer); - if (!base) - return; - - peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); - if (peer) { - if (!rt6_set_peer(rt, peer)) - inet_putpeer(peer); - } -} - -static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) -{ - if (rt6_has_peer(rt)) - return rt6_peer_ptr(rt); - - rt6_bind_peer(rt, create); - return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL); -} - -static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) -{ - return __rt6_get_peer(rt, 1); -} - static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { struct rt6_info *rt = (struct rt6_info *)dst; @@ -291,7 +261,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, struct dst_entry *dst = &rt->dst; memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); - rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); INIT_LIST_HEAD(&rt->rt6i_siblings); } return rt; @@ -1052,7 +1021,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new = &rt->dst; memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); - rt6_init_peer(rt, net->ipv6.peers); new->__use = 1; new->input = dst_discard; @@ -1597,10 +1565,8 @@ int ip6_route_add(struct fib6_config *cfg) ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) { + if (rt->rt6i_dst.plen == 128) rt->dst.flags |= DST_HOST; - dst_metrics_set_force_overwrite(&rt->dst); - } #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index f337a908a76a..6ae256b4c55a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -71,13 +71,6 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } -static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) -{ - struct rt6_info *rt = (struct rt6_info *)xdst; - - rt6_init_peer(rt, net->ipv6.peers); -} - static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { @@ -106,8 +99,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, return -ENODEV; } - rt6_transfer_peer(&xdst->u.rt6, rt); - /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | @@ -255,10 +246,6 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); dst_destroy_metrics_generic(dst); - if (rt6_has_peer(&xdst->u.rt6)) { - struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6); - inet_putpeer(peer); - } xfrm_dst_destroy(xdst); } @@ -308,7 +295,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, - .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, -- cgit From 7eee8cd4d8d6ce3f70e6bea716c19b28635a5232 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 30 Apr 2015 17:25:12 +0800 Subject: ipv4: remove the unnecessary codes in fib_info_hash_move The whole hlist will be moved, so not need to call hlist_del before add the hlist_node to other hlist_head. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8d695b6659c7..28ec3c1823bf 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -713,8 +713,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *dest; unsigned int new_hash; - hlist_del(&fi->fib_hash); - new_hash = fib_info_hashfn(fi); dest = &new_info_hash[new_hash]; hlist_add_head(&fi->fib_hash, dest); @@ -731,8 +729,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *ldest; unsigned int new_hash; - hlist_del(&fi->fib_lhash); - new_hash = fib_laddr_hashfn(fi->fib_prefsrc); ldest = &new_laddrhash[new_hash]; hlist_add_head(&fi->fib_lhash, ldest); -- cgit From 4749c3ef854e3a5d3dd3cc0ccd2dcb7e05d583bd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Apr 2015 12:12:00 +0200 Subject: net: sched: remove TC_MUNGED bits Not used. pedit sets TC_MUNGED when packet content was altered, but all the core does is unset MUNGED again and then set OK2MUNGE. And the latter isn't tested anywhere. So lets remove both TC_MUNGED and TC_OK2MUNGE. Signed-off-by: Florian Westphal Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 5 ----- net/sched/act_pedit.c | 5 +---- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3d43e4979f27..af427a3dbcba 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -392,11 +392,6 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, list_for_each_entry(a, actions, list) { repeat: ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 59649d588d79..17e6d6669c7f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -108,7 +108,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = a->priv; - int i, munged = 0; + int i; unsigned int off; if (skb_unclone(skb, GFP_ATOMIC)) @@ -156,11 +156,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, *ptr = ((*ptr & tkey->mask) ^ tkey->val); if (ptr == &_data) skb_store_bits(skb, off + offset, ptr, 4); - munged++; } - if (munged) - skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); goto done; } else WARN(1, "pedit BUG: index %d\n", p->tcf_index); -- cgit From 7035870d1219f5cd86128edcb4c3517def632ad3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sun, 3 May 2015 17:05:49 -0700 Subject: ipv6: Check RTF_LOCAL on rt->rt6i_flags instead of rt->dst.flags In my earlier commit: 653437d02f1f ("ipv6: Stop /128 route from disappearing after pmtu update"), there was a horrible typo. Instead of checking RTF_LOCAL on rt->rt6i_flags, it was checked on rt->dst.flags. This patch fixes it. Signed-off-by: Martin KaFai Lau Cc: Hajime Tazaki Cc: David S. Miller Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 352271154ddb..106dbe5140f1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -920,7 +920,7 @@ redo_rt6_select: if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); - else if (!(rt->dst.flags & DST_HOST) || !(rt->dst.flags & RTF_LOCAL)) + else if (!(rt->dst.flags & DST_HOST) || !(rt->rt6i_flags & RTF_LOCAL)) nrt = rt6_alloc_clone(rt, &fl6->daddr); else goto out2; -- cgit From 82a584b7cd366511a22e37675b029cf2fb58e291 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 29 Apr 2015 15:33:21 -0700 Subject: ipv6: Flow label state ranges This patch divides the IPv6 flow label space into two ranges: 0-7ffff is reserved for flow label manager, 80000-fffff will be used for creating auto flow labels (per RFC6438). This only affects how labels are set on transmit, it does not affect receive. This range split can be disbaled by systcl. Background: IPv6 flow labels have been an unmitigated disappointment thus far in the lifetime of IPv6. Support in HW devices to use them for ECMP is lacking, and OSes don't turn them on by default. If we had these we could get much better hashing in IPv6 networks without resorting to DPI, possibly eliminating some of the motivations to to define new encaps in UDP just for getting ECMP. Unfortunately, the initial specfications of IPv6 did not clarify how they are to be used. There has always been a vague concept that these can be used for ECMP, flow hashing, etc. and we do now have a good standard how to this in RFC6438. The problem is that flow labels can be either stateful or stateless (as in RFC6438), and we are presented with the possibility that a stateless label may collide with a stateful one. Attempts to split the flow label space were rejected in IETF. When we added support in Linux for RFC6438, we could not turn on flow labels by default due to this conflict. This patch splits the flow label space and should give us a path to enabling auto flow labels by default for all IPv6 packets. This is an API change so we need to consider compatibility with existing deployment. The stateful range is chosen to be the lower values in hopes that most uses would have chosen small numbers. Once we resolve the stateless/stateful issue, we can proceed to look at enabling RFC6438 flow labels by default (starting with scaled testing). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 1 + net/ipv6/ip6_flowlabel.c | 4 ++++ net/ipv6/sysctl_net_ipv6.c | 8 ++++++++ 3 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index eef63b394c5a..4632afa57e05 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -768,6 +768,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.auto_flowlabels = 0; net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; + net->ipv6.sysctl.flowlabel_state_ranges = 1; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index d491125011c4..1f9ebe3cbb4a 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -595,6 +595,10 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; + if (net->ipv6.sysctl.flowlabel_state_ranges && + (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) + return -ERANGE; + fl = fl_create(net, sk, &freq, optval, optlen, &err); if (!fl) return err; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index abcc79f649b3..4e705add4f18 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -68,6 +68,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "flowlabel_state_ranges", + .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -109,6 +116,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect; ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries; ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; + ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit From 849b920e17a2b1e9a3c912ca960dc667d27985e8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 30 Apr 2015 14:53:48 -0700 Subject: etherdev: Avoid unnecessary byte swap in check for Ethertype This change takes advantage of the fact that ETH_P_802_3_MIN is aligned to 512 so as a result we can actually ignore the lower 8b when comparing the Ethertype to ETH_P_802_3_MIN. This allows us to avoid a byte swap by simply masking the value and comparing it to the byte swapped value for ETH_P_802_3_MIN. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ethernet/eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f3bad41d725f..60069318d5d1 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -178,7 +178,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); - if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) + if (likely((eth->h_proto & htons(0xFF00)) >= htons(ETH_P_802_3_MIN))) return eth->h_proto; /* -- cgit From d54385ce68cd18ab002b46f61246ad197cec92de Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 30 Apr 2015 14:53:54 -0700 Subject: etherdev: Process is_multicast_ether_addr at same size as other operations This change makes it so that we process the address in is_multicast_ether_addr at the same size as the other calls. This allows us to avoid duplicate reads when used with other calls such as is_zero_ether_addr or eth_addr_copy. In addition I have added a 64 bit version of the function so in eth_type_trans we can process the destination address as a 64 bit value throughout. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ethernet/eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 60069318d5d1..21c211e9fd5a 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -159,7 +159,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb_pull_inline(skb, ETH_HLEN); eth = eth_hdr(skb); - if (unlikely(is_multicast_ether_addr(eth->h_dest))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else -- cgit From 610986e7262624e2dd29ad8dea05b4e1ac3f07fb Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 30 Apr 2015 14:53:59 -0700 Subject: etherdev: Use skb->data to retrieve Ethernet header instead of eth_hdr Avoid recomputing the Ethernet header location and instead just use the pointer provided by skb->data. The problem with using eth_hdr is that the compiler wasn't smart enough to realize that skb->head + skb->mac_header was the same thing as skb->data before it added ETH_HLEN. By just caching it off before calling skb_pull_inline we can avoid a few unnecessary instructions. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ethernet/eth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 21c211e9fd5a..314e4c5a5a5e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -156,8 +156,9 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); + + eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - eth = eth_hdr(skb); if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) -- cgit From 196da974758550a3933c8b0244ef98148df10552 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 1 May 2015 01:10:57 +0200 Subject: tcp: move struct tcp_sacktag_state to tcp_ack() Later patch passes two values set in tcp_sacktag_one() to tcp_clean_rtx_queue(). Prepare passing them via struct tcp_sacktag_state. Acked-by: Yuchung Cheng Cc: Eric Dumazet Signed-off-by: Kenneth Klette Jonassen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bc790ea9960f..9902cf1abdf4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1634,7 +1634,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, long *sack_rtt_us) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1642,7 +1642,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; - struct tcp_sacktag_state state; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; @@ -1650,9 +1649,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int i, j; int first_sack_index; - state.flag = 0; - state.reord = tp->packets_out; - state.rtt_us = -1L; + state->flag = 0; + state->reord = tp->packets_out; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1663,7 +1661,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) - state.flag |= FLAG_DSACKING_ACK; + state->flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1728,7 +1726,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } skb = tcp_write_queue_head(sk); - state.fack_count = 0; + state->fack_count = 0; i = 0; if (!tp->sacked_out) { @@ -1762,10 +1760,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, /* Head todo? */ if (before(start_seq, cache->start_seq)) { - skb = tcp_sacktag_skip(skb, sk, &state, + skb = tcp_sacktag_skip(skb, sk, state, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, - &state, + state, start_seq, cache->start_seq, dup_sack); @@ -1776,7 +1774,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, - &state, + state, cache->end_seq); /* ...tail remains todo... */ @@ -1785,12 +1783,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; cache++; goto walk; } - skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq); + skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; @@ -1800,12 +1798,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; } - skb = tcp_sacktag_skip(skb, sk, &state, start_seq); + skb = tcp_sacktag_skip(skb, sk, state, start_seq); walk: - skb = tcp_sacktag_walk(skb, sk, next_dup, &state, + skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, end_seq, dup_sack); advance_sp: @@ -1820,9 +1818,9 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state.reord < tp->fackets_out) && + if ((state->reord < tp->fackets_out) && ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); @@ -1834,8 +1832,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt_us = state.rtt_us; - return state.flag; + return state->flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than @@ -3459,6 +3456,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sacktag_state sack_state; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3467,7 +3465,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - long sack_rtt_us = -1L; + + sack_state.rtt_us = -1L; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3531,7 +3530,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; @@ -3556,7 +3555,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, - sack_rtt_us); + sack_state.rtt_us); acked -= tp->packets_out; /* Advance cwnd if state allows */ @@ -3608,7 +3607,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } -- cgit From 31231a8a873026410eab438c5757430546a517d1 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 1 May 2015 01:10:58 +0200 Subject: tcp: improve RTT from SACK for CC tcp_sacktag_one() always picks the earliest sequence SACKed for RTT. This might not make sense for congestion control in cases where: 1. ACKs are lost, i.e. a SACK following a lost SACK covers both new and old segments at the receiver. 2. The receiver disregards the RFC 5681 recommendation to immediately ACK out-of-order segments. Give congestion control a RTT for the latest segment SACKed, which is the most accurate RTT estimate, but preserve the conservative RTT for RTO. Removes the call to skb_mstamp_get() in tcp_sacktag_one(). Cc: Yuchung Cheng Cc: Eric Dumazet Signed-off-by: Kenneth Klette Jonassen Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9902cf1abdf4..f563d2a88809 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1130,7 +1130,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sacktag_state { int reord; int fack_count; - long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + struct skb_mstamp first_sackt; + struct skb_mstamp last_sackt; int flag; }; @@ -1233,14 +1238,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - /* Pick the earliest sequence sacked for RTT */ - if (state->rtt_us < 0) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - state->rtt_us = skb_mstamp_us_delta(&now, - xmit_time); - } + if (state->first_sackt.v64 == 0) + state->first_sackt = *xmit_time; + state->last_sackt = *xmit_time; } if (sacked & TCPCB_LOST) { @@ -3049,7 +3049,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, long sack_rtt_us) + u32 prior_snd_una, + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt, now; @@ -3057,8 +3058,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; - long ca_seq_rtt_us = -1L; + long sack_rtt_us = -1L; long seq_rtt_us = -1L; + long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; bool rtt_update; @@ -3147,7 +3149,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, skb_mstamp_get(&now); if (likely(first_ackt.v64)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + if (sack->first_sackt.v64) { + sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); } rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); @@ -3178,10 +3184,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { @@ -3466,7 +3470,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - sack_state.rtt_us = -1L; + sack_state.first_sackt.v64 = 0; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3555,7 +3559,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, - sack_state.rtt_us); + &sack_state); acked -= tp->packets_out; /* Advance cwnd if state allows */ -- cgit From 138998fdd12e7362756e158d00856a2aabd5f0c1 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 1 May 2015 01:10:59 +0200 Subject: tcp: invoke pkts_acked hook on every ACK Invoking pkts_acked is currently conditioned on FLAG_ACKED: receiving a cumulative ACK of new data, or ACK with SYN flag set. Remove this condition so that CC may get RTT measurements from all SACKs. Cc: Yuchung Cheng Cc: Eric Dumazet Cc: Neal Cardwell Signed-off-by: Kenneth Klette Jonassen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f563d2a88809..09bdc4abfcbb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3159,9 +3159,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { - const struct tcp_congestion_ops *ca_ops - = inet_csk(sk)->icsk_ca_ops; - tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { @@ -3184,9 +3181,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) - ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3196,6 +3190,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_rearm_rto(sk); } + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); + #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); -- cgit From 087c1a601ad7f851a2d31f5fa0e5e9dfc766df55 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Apr 2015 20:14:07 -0700 Subject: net: sched: run ingress qdisc without locks TC classifiers/actions were converted to RCU by John in the series: http://thread.gmane.org/gmane.linux.network/329739/focus=329739 and many follow on patches. This is the last patch from that series that finally drops ingress spin_lock. Single cpu ingress+u32 performance goes from 22.9 Mpps to 24.5 Mpps. In two cpu case when both cores are receiving traffic on the same device and go into the same ingress+u32 the performance jumps from 4.5 + 4.5 Mpps to 23.5 + 23.5 Mpps Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Signed-off-by: Jamal Hadi Salim Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- net/sched/sch_ingress.c | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c7ba0388f1be..74a5b62f7568 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3547,10 +3547,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); } return result; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 4cdbfb85686a..a89cc3278bfb 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -65,11 +65,11 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) result = tc_classify(skb, fl, &res); - qdisc_bstats_update(sch, skb); + qdisc_bstats_update_cpu(sch, skb); switch (result) { case TC_ACT_SHOT: result = TC_ACT_SHOT; - qdisc_qstats_drop(sch); + qdisc_qstats_drop_cpu(sch); break; case TC_ACT_STOLEN: case TC_ACT_QUEUED: @@ -91,6 +91,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { net_inc_ingress_queue(); + sch->flags |= TCQ_F_CPUSTATS; return 0; } -- cgit From 6a21165480a066a27c1f1dbd32aec581c612ba23 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 1 May 2015 16:39:54 +0200 Subject: net: ipv4: route: Fix sending IGMP messages with link address In setups with a global scope address on an interface, and a lesser scope address on an interface sending IGMP reports, the reports can be sent using the other interfaces global scope address rather than the local interface address. RFC 2236 suggests: Ignore the Report if you cannot identify the source address of the packet as belonging to a subnet assigned to the interface on which the packet was received. since such reports could be forged. Look at the protocol when deciding if a RT_SCOPE_LINK address should be used for the packet. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7cad0bf1c71c..9e15f5ca4495 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2091,7 +2091,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto out; } if (ipv4_is_local_multicast(fl4->daddr) || - ipv4_is_lbcast(fl4->daddr)) { + ipv4_is_lbcast(fl4->daddr) || + fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); -- cgit From 50fb799289501c2eab9f43fc9af513027e1e994f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:12 -0700 Subject: net: Add skb_get_hash_perturb This calls flow_disect and __skb_get_hash to procure a hash for a packet. Input includes a key to initialize jhash. This function does not set skb->hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2c35c02a931e..b1df995ef06c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -267,13 +267,12 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval) { - __flow_hash_secret_init(); - return jhash_3words(a, b, c, hashrnd); + return jhash_3words(a, b, c, keyval); } -static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) { u32 hash; @@ -287,7 +286,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) hash = __flow_hash_3words((__force u32)keys->dst, (__force u32)keys->src, - (__force u32)keys->ports); + (__force u32)keys->ports, + keyval); if (!hash) hash = 1; @@ -296,10 +296,20 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys) { - return __flow_hash_from_keys(keys); + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); +static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, u32 keyval) +{ + if (!skb_flow_dissect(skb, keys)) + return 0; + + return __flow_hash_from_keys(keys, keyval); +} + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value @@ -309,8 +319,12 @@ EXPORT_SYMBOL(flow_hash_from_keys); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; - if (!skb_flow_dissect(skb, &keys)) + __flow_hash_secret_init(); + + hash = ___skb_get_hash(skb, &keys, hashrnd); + if (!hash) return; if (keys.ports) @@ -318,10 +332,18 @@ void __skb_get_hash(struct sk_buff *skb) skb->sw_hash = 1; - skb->hash = __flow_hash_from_keys(&keys); + skb->hash = hash; } EXPORT_SYMBOL(__skb_get_hash); +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) +{ + struct flow_keys keys; + + return ___skb_get_hash(skb, &keys, perturb); +} +EXPORT_SYMBOL(skb_get_hash_perturb); + /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. -- cgit From 342db221829f8341e51839e40c47be137dfc6ebe Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:13 -0700 Subject: sched: Call skb_get_hash_perturb in sch_fq_codel Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1e52decb7b59..a6fc53d69513 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -23,7 +23,6 @@ #include #include #include -#include #include /* Fair Queue CoDel. @@ -68,15 +67,9 @@ struct fq_codel_sched_data { }; static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, - const struct sk_buff *skb) + struct sk_buff *skb) { - struct flow_keys keys; - unsigned int hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); + u32 hash = skb_get_hash_perturb(skb, q->perturbation); return reciprocal_scale(hash, q->flows_cnt); } -- cgit From f969777ac35506133777ac7674c394e8e298e623 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:14 -0700 Subject: sched: Call skb_get_hash_perturb in sch_hhf Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_hhf.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'net') diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 15d3aabfe250..9d15cb6b8cb1 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -176,22 +175,6 @@ static u32 hhf_time_stamp(void) return jiffies; } -static unsigned int skb_hash(const struct hhf_sched_data *q, - const struct sk_buff *skb) -{ - struct flow_keys keys; - unsigned int hash; - - if (skb->sk && skb->sk->sk_hash) - return skb->sk->sk_hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); - return hash; -} - /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, @@ -280,7 +263,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) } /* Get hashed flow-id of the skb. */ - hash = skb_hash(q, skb); + hash = skb_get_hash_perturb(skb, q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; -- cgit From 63c0ad4d4135d3bdb81a1ee42436f3a403632a3e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:15 -0700 Subject: sched: Call skb_get_hash_perturb in sch_sfb Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_sfb.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 5819dd82630d..4b815193326c 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -26,7 +26,6 @@ #include #include #include -#include /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) @@ -285,9 +284,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) int i; u32 p_min = ~0; u32 minqlen = ~0; - u32 r, slot, salt, sfbhash; + u32 r, sfbhash; + u32 slot = q->slot; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); @@ -309,22 +308,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) fl = rcu_dereference_bh(q->filter_list); if (fl) { + u32 salt; + /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; - keys.src = salt; - keys.dst = 0; - keys.ports = 0; + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); } else { - skb_flow_dissect(skb, &keys); + sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation); } - slot = q->slot; - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -356,10 +350,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); + sfbhash = skb_get_hash_perturb(skb, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; -- cgit From ada1dba04c273dbabefff6a9a9f8c2bdcb61a858 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:16 -0700 Subject: sched: Call skb_get_hash_perturb in sch_sfq Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b877140beda5..7d1492663360 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -23,7 +23,6 @@ #include #include #include -#include #include @@ -156,30 +155,10 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index return &q->dep[val - SFQ_MAX_FLOWS]; } -/* - * In order to be able to quickly rehash our queue when timer changes - * q->perturbation, we store flow_keys in skb->cb[] - */ -struct sfq_skb_cb { - struct flow_keys keys; -}; - -static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) -{ - qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; -} - static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { - const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; - unsigned int hash; - - hash = jhash_3words((__force u32)keys->dst, - (__force u32)keys->src ^ keys->ip_proto, - (__force u32)keys->ports, q->perturbation); - return hash & (q->divisor - 1); + return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -196,10 +175,8 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); - if (!fl) { - skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); + if (!fl) return sfq_hash(q, skb) + 1; - } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tc_classify(skb, fl, &res); -- cgit From 2f59e1ebaa7f762c8825871b5486b5f5b4fa952f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:17 -0700 Subject: net: Add flow_keys digest Some users of flow keys (well just sch_choke now) need to pass flow_keys in skbuff cb, and use them for exact comparisons of flows so that skb->hash is not sufficient. In order to increase size of the flow_keys structure, we introduce another structure for the purpose of passing flow keys in skbuff cb. We limit this structure to sixteen bytes, and we will technically treat this as a digest of flow_keys struct hence its name flow_keys_digest. In the first incaranation we just copy the flow_keys structure up to 16 bytes-- this is the same information previously passed in the cb. In the future, we'll adapt this for larger flow_keys and could use something like SHA-1 over the whole flow_keys to improve the quality of the digest. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b1df995ef06c..d3acc4dff4ae 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -310,6 +310,33 @@ static inline u32 ___skb_get_hash(const struct sk_buff *skb, return __flow_hash_from_keys(keys, keyval); } +struct _flow_keys_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 padding; + __be32 ports; + __be32 src; + __be32 dst; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow) +{ + struct _flow_keys_digest_data *data = + (struct _flow_keys_digest_data *)digest; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + memset(digest, 0, sizeof(*digest)); + + data->n_proto = flow->n_proto; + data->ip_proto = flow->ip_proto; + data->ports = flow->ports; + data->src = flow->src; + data->dst = flow->dst; +} +EXPORT_SYMBOL(make_flow_keys_digest); + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value -- cgit From 2e99403d28c182aa7ffb2d4ef34472df4873a4dd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:18 -0700 Subject: sch_choke: Use flow_keys_digest Call make_flow_keys_digest to get a digest from flow keys and use that to pass skbuff cb and for comparing flows. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_choke.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index c009eb9045ce..dfe3da75594c 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -133,16 +133,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } -/* private part of skb->cb[] that a qdisc is allowed to use - * is limited to QDISC_CB_PRIV_LEN bytes. - * As a flow key might be too large, we store a part of it only. - */ -#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3) - struct choke_skb_cb { u16 classid; u8 keys_valid; - u8 keys[QDISC_CB_PRIV_LEN - 3]; + struct flow_keys_digest keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -177,18 +171,18 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; skb_flow_dissect(skb1, &temp); - memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN); + make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; skb_flow_dissect(skb2, &temp); - memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN); + make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, - CHOKE_K_LEN); + sizeof(choke_skb_cb(skb1)->keys)); } /* -- cgit From c19ae86a510cf4332af64caab04718bc853d3184 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 1 May 2015 22:19:43 -0700 Subject: tc: remove unused redirect ttl improves ingress+u32 performance from 22.4 Mpps to 22.9 Mpps Signed-off-by: Jamal Hadi Salim Signed-off-by: Alexei Starovoitov Acked-by: Florian Westphal Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 74a5b62f7568..862875ec8f2f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3531,18 +3531,9 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); */ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) { - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); int result = TC_ACT_OK; struct Qdisc *q; - if (unlikely(MAX_RED_LOOP < ttl++)) { - net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", - skb->skb_iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); q = rcu_dereference(rxq->qdisc); -- cgit From 3c9e4f870012350a36dc3091c7a57f5ba2799afe Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 2 May 2015 14:01:06 +0200 Subject: bridge: multicast: call skb_checksum_{simple_, }validate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's use these new, neat helpers. Signed-off-by: Linus Lüssing Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4b6722f8f179..b52f4cb8aee9 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1610,16 +1610,8 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, if (!pskb_may_pull(skb2, sizeof(*ih))) goto out; - switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb2->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb2->csum = 0; - if (skb_checksum_complete(skb2)) - goto out; - } + if (skb_checksum_simple_validate(skb2)) + goto out; err = 0; @@ -1737,20 +1729,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, ip6h = ipv6_hdr(skb2); - switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len, - IPPROTO_ICMPV6, skb2->csum)) - break; - /*FALLTHROUGH*/ - case CHECKSUM_NONE: - skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, - &ip6h->daddr, - skb2->len, - IPPROTO_ICMPV6, 0)); - if (__skb_checksum_complete(skb2)) - goto out; - } + if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo)) + goto out; err = 0; -- cgit From 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 2 May 2015 14:01:07 +0200 Subject: net: Export IGMP/MLD message validation code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch, the IGMP and MLD message validation functions are moved from the bridge code to IPv4/IPv6 multicast files. Some small refactoring was done to enhance readibility and to iron out some differences in behaviour between the IGMP and MLD parsing code (e.g. the skb-cloning of MLD messages is now only done if necessary, just like the IGMP part always did). Finally, these IGMP and MLD message validation functions are exported so that not only the bridge can use it but batman-adv later, too. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 218 +++++++--------------------------------------- net/core/skbuff.c | 87 ++++++++++++++++++ net/ipv4/igmp.c | 162 ++++++++++++++++++++++++++++++++++ net/ipv6/Makefile | 1 + net/ipv6/mcast_snoop.c | 213 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 493 insertions(+), 188 deletions(-) create mode 100644 net/ipv6/mcast_snoop.c (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b52f4cb8aee9..2d69d5cab52f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -975,9 +975,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int err = 0; __be32 group; - if (!pskb_may_pull(skb, sizeof(*ih))) - return -EINVAL; - ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -1248,25 +1245,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay = 10 * HZ; group = 0; } - } else { - if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { - err = -EINVAL; - goto out; - } - + } else if (skb->len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; - } - - /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer - * all-systems destination addresses (224.0.0.1) for general queries - */ - if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { - err = -EINVAL; + } else { goto out; } @@ -1329,12 +1315,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ - if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - err = -EINVAL; - goto out; - } - if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1358,14 +1338,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, is_general_query = group && ipv6_addr_any(group); - /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer - * all-nodes destination address (ff02::1) for general queries - */ - if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { - err = -EINVAL; - goto out; - } - if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.u.ip6 = ip6h->saddr; @@ -1557,66 +1529,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2 = skb; - const struct iphdr *iph; + struct sk_buff *skb_trimmed = NULL; struct igmphdr *ih; - unsigned int len; - unsigned int offset; int err; - /* We treat OOM as packet loss for now. */ - if (!pskb_may_pull(skb, sizeof(*iph))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (iph->ihl < 5 || iph->version != 4) - return -EINVAL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb))) - return -EINVAL; - - iph = ip_hdr(skb); + err = ip_mc_check_igmp(skb, &skb_trimmed); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - return -EINVAL; - - if (iph->protocol != IPPROTO_IGMP) { - if (!ipv4_is_local_multicast(iph->daddr)) + if (err == -ENOMSG) { + if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; + } else if (err < 0) { + return err; } - len = ntohs(iph->tot_len); - if (skb->len < len || len < ip_hdrlen(skb)) - return -EINVAL; - - if (skb->len > len) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = pskb_trim_rcsum(skb2, len); - if (err) - goto err_out; - } - - len -= ip_hdrlen(skb2); - offset = skb_network_offset(skb2) + ip_hdrlen(skb2); - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - - err = -EINVAL; - if (!pskb_may_pull(skb2, sizeof(*ih))) - goto out; - - if (skb_checksum_simple_validate(skb2)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; - ih = igmp_hdr(skb2); + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1625,21 +1553,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2, vid); + err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); break; } -out: - __skb_push(skb2, offset); -err_out: - if (skb2 != skb) - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } @@ -1649,126 +1575,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2; - const struct ipv6hdr *ip6h; - u8 icmp6_type; - u8 nexthdr; - __be16 frag_off; - unsigned int len; - int offset; + struct sk_buff *skb_trimmed = NULL; + struct mld_msg *mld; int err; - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -EINVAL; + err = ipv6_mc_check_mld(skb, &skb_trimmed); - ip6h = ipv6_hdr(skb); - - /* - * We're interested in MLD messages only. - * - Version is 6 - * - MLD has always Router Alert hop-by-hop option - * - But we do not support jumbrograms. - */ - if (ip6h->version != 6) - return 0; - - /* Prevent flooding this packet if there is no listener present */ - if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) - BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - - if (ip6h->nexthdr != IPPROTO_HOPOPTS || - ip6h->payload_len == 0) - return 0; - - len = ntohs(ip6h->payload_len) + sizeof(*ip6h); - if (skb->len < len) - return -EINVAL; - - nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); - - if (offset < 0 || nexthdr != IPPROTO_ICMPV6) + if (err == -ENOMSG) { + if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; - - /* Okay, we found ICMPv6 header */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = -EINVAL; - if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) - goto out; - - len -= offset - skb_network_offset(skb2); - - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - skb_postpull_rcsum(skb2, skb_network_header(skb2), - skb_network_header_len(skb2)); - - icmp6_type = icmp6_hdr(skb2)->icmp6_type; - - switch (icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - break; - default: - err = 0; - goto out; - } - - /* Okay, we found MLD message. Check further. */ - if (skb2->len > len) { - err = pskb_trim_rcsum(skb2, len); - if (err) - goto out; - err = -EINVAL; + } else if (err < 0) { + return err; } - ip6h = ipv6_hdr(skb2); - - if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; + mld = (struct mld_msg *)skb_transport_header(skb); - switch (icmp6_type) { + switch (mld->mld_type) { case ICMPV6_MGM_REPORT: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; - } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2, vid); + err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2, vid); + err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); - } + break; } -out: - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3cfff2a3d651..1e4278a4dd7e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4030,6 +4030,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) } EXPORT_SYMBOL(skb_checksum_setup); +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) { + kfree_skb(skb); + return NULL; + } else if (skb->len == len) { + return skb; + } + + skb_chk = skb_clone(skb, GFP_ATOMIC); + kfree_skb(skb); + + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. + */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + int ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + return NULL; + + if (!pskb_may_pull(skb_chk, offset)) { + kfree_skb(skb_chk); + return NULL; + } + + __skb_pull(skb_chk, offset); + ret = skb_chkf(skb_chk); + __skb_push(skb_chk, offset); + + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a3a697f5ffba..651cdf648ec4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1339,6 +1339,168 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group); +static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? */ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ip_mc_check_igmp_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. */ diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2e8c06108ab9..0f3f1999719a 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -48,4 +48,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +obj-y += mcast_snoop.o endif diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c new file mode 100644 index 000000000000..1a2cbc13a7d3 --- /dev/null +++ b/net/ipv6/mcast_snoop.c @@ -0,0 +1,213 @@ +/* Copyright (C) 2010: YOSHIFUJI Hideaki + * Copyright (C) 2015: Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * + * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. + */ + +#include +#include +#include +#include +#include + +static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + if (ip6h->version != 6) + return -EINVAL; + + len = offset + ntohs(ip6h->payload_len); + if (skb->len < len || len <= offset) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_exthdrs(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int offset; + u8 nexthdr; + __be16 frag_off; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + return -ENOMSG; + + nexthdr = ip6h->nexthdr; + offset = skb_network_offset(skb) + sizeof(*ip6h); + offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); + + if (offset < 0) + return -EINVAL; + + if (nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct mld2_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ipv6_mc_check_mld_query(struct sk_buff *skb) +{ + struct mld_msg *mld; + unsigned int len = skb_transport_offset(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + len += sizeof(struct mld_msg); + if (skb->len < len) + return -EINVAL; + + /* MLDv1? */ + if (skb->len != len) { + /* or MLDv2? */ + len += sizeof(struct mld2_query) - sizeof(struct mld_msg); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + mld = (struct mld_msg *)skb_transport_header(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (ipv6_addr_any(&mld->mld_mca) && + !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_mld_msg(struct sk_buff *skb) +{ + struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MGM_REPORT: + /* fall through */ + return 0; + case ICMPV6_MLD2_REPORT: + return ipv6_mc_check_mld_reportv2(skb); + case ICMPV6_MGM_QUERY: + return ipv6_mc_check_mld_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); +} + +static int __ipv6_mc_check_mld(struct sk_buff *skb, + struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + int ret; + + transport_len = ntohs(ipv6_hdr(skb)->payload_len); + transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ipv6_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ipv6_mc_check_mld_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ipv6_mc_check_mld - checks whether this is a sane MLD packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) + * + * Checks whether an IPv6 packet is a valid MLD packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an MLD packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an MLD packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret; + + ret = ipv6_mc_check_ip6hdr(skb); + if (ret < 0) + return ret; + + ret = ipv6_mc_check_exthdrs(skb); + if (ret < 0) + return ret; + + return __ipv6_mc_check_mld(skb, skb_trimmed); +} +EXPORT_SYMBOL(ipv6_mc_check_mld); -- cgit From 57f1d1868fb5d71a20bfb1bc807274471c2ff459 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 4 May 2015 10:36:44 +0800 Subject: tipc: rename functions defined in subscr.c When a topology server accepts a connection request from its client, it allocates a connection instance and a tipc_subscriber structure object. The former is used to communicate with client, and the latter is often treated as a subscriber which manages all subscription events requested from a same client. When a topology server receives a request of subscribing name services from a client through the connection, it creates a tipc_subscription structure instance which is seen as a subscription recording what name services are subscribed. In order to manage all subscriptions from a same client, topology server links them into the subscrp_list of the subscriber. So subscriber and subscription completely represents different meanings respectively, but function names associated with them make us so confused that we are unable to easily tell which function is against subscriber and which is to subscription. So we want to eliminate the confusion by renaming them. Signed-off-by: Ying Xue Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/core.c | 4 +- net/tipc/name_table.c | 34 ++++++--------- net/tipc/subscr.c | 116 ++++++++++++++++++++++---------------------------- net/tipc/subscr.h | 18 ++++---- 4 files changed, 75 insertions(+), 97 deletions(-) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index be1c9fa60b09..005ba5eb0ea4 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -68,7 +68,7 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_nametbl; - err = tipc_subscr_start(net); + err = tipc_topsrv_start(net); if (err) goto out_subscr; return 0; @@ -83,7 +83,7 @@ out_sk_rht: static void __net_exit tipc_exit_net(struct net *net) { - tipc_subscr_stop(net); + tipc_topsrv_stop(net); tipc_net_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index ab0ac62a1287..0f47f08bf38f 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -330,13 +330,9 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscr_report_overlap(s, - publ->lower, - publ->upper, - TIPC_PUBLISHED, - publ->ref, - publ->node, - created_subseq); + tipc_subscrp_report_overlap(s, publ->lower, publ->upper, + TIPC_PUBLISHED, publ->ref, + publ->node, created_subseq); } return publ; } @@ -404,13 +400,9 @@ found: /* Notify any waiting subscriptions */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscr_report_overlap(s, - publ->lower, - publ->upper, - TIPC_WITHDRAWN, - publ->ref, - publ->node, - removed_subseq); + tipc_subscrp_report_overlap(s, publ->lower, publ->upper, + TIPC_WITHDRAWN, publ->ref, + publ->node, removed_subseq); } return publ; @@ -432,19 +424,17 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, return; while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscr_overlap(s, sseq->lower, sseq->upper)) { + if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) { struct publication *crs; struct name_info *info = sseq->info; int must_report = 1; list_for_each_entry(crs, &info->zone_list, zone_list) { - tipc_subscr_report_overlap(s, - sseq->lower, - sseq->upper, - TIPC_PUBLISHED, - crs->ref, - crs->node, - must_report); + tipc_subscrp_report_overlap(s, sseq->lower, + sseq->upper, + TIPC_PUBLISHED, + crs->ref, crs->node, + must_report); must_report = 0; } } diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 1c147c869c2e..caec0b2b0740 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -42,12 +42,12 @@ * struct tipc_subscriber - TIPC network topology subscriber * @conid: connection identifier to server connecting to subscriber * @lock: control access to subscriber - * @subscription_list: list of subscription objects for this subscriber + * @subscrp_list: list of subscription objects for this subscriber */ struct tipc_subscriber { int conid; spinlock_t lock; - struct list_head subscription_list; + struct list_head subscrp_list; }; /** @@ -62,9 +62,9 @@ static u32 htohl(u32 in, int swap) return swap ? swab32(in) : in; } -static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node) +static void tipc_subscrp_send_event(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port_ref, u32 node) { struct tipc_net *tn = net_generic(sub->net, tipc_net_id); struct tipc_subscriber *subscriber = sub->subscriber; @@ -82,12 +82,13 @@ static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower, } /** - * tipc_subscr_overlap - test for subscription overlap with the given values + * tipc_subscrp_check_overlap - test for subscription overlap with the + * given values * * Returns 1 if there is overlap, otherwise 0. */ -int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper) +int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper) { if (found_lower < sub->seq.lower) found_lower = sub->seq.lower; @@ -98,24 +99,20 @@ int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, return 1; } -/** - * tipc_subscr_report_overlap - issue event if there is subscription overlap - * - * Protected by nameseq.lock in name_table.c - */ -void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, int must) +void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper, u32 event, u32 port_ref, + u32 node, int must) { - if (!tipc_subscr_overlap(sub, found_lower, found_upper)) + if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper)) return; if (!must && !(sub->filter & TIPC_SUB_PORTS)) return; - subscr_send_event(sub, found_lower, found_upper, event, port_ref, node); + tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, + node); } -static void subscr_timeout(unsigned long data) +static void tipc_subscrp_timeout(unsigned long data) { struct tipc_subscription *sub = (struct tipc_subscription *)data; struct tipc_subscriber *subscriber = sub->subscriber; @@ -134,35 +131,30 @@ static void subscr_timeout(unsigned long data) tipc_nametbl_unsubscribe(sub); /* Unlink subscription from subscriber */ - list_del(&sub->subscription_list); + list_del(&sub->subscrp_list); spin_unlock_bh(&subscriber->lock); /* Notify subscriber of timeout */ - subscr_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, - TIPC_SUBSCR_TIMEOUT, 0, 0); + tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, + TIPC_SUBSCR_TIMEOUT, 0, 0); /* Now destroy subscription */ kfree(sub); atomic_dec(&tn->subscription_count); } -/** - * subscr_del - delete a subscription within a subscription list - * - * Called with subscriber lock held. - */ -static void subscr_del(struct tipc_subscription *sub) +static void tipc_subscrp_delete(struct tipc_subscription *sub) { struct tipc_net *tn = net_generic(sub->net, tipc_net_id); tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscription_list); + list_del(&sub->subscrp_list); kfree(sub); atomic_dec(&tn->subscription_count); } -static void subscr_release(struct tipc_subscriber *subscriber) +static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { struct tipc_subscription *sub; struct tipc_subscription *sub_temp; @@ -170,14 +162,14 @@ static void subscr_release(struct tipc_subscriber *subscriber) spin_lock_bh(&subscriber->lock); /* Destroy any existing subscriptions for subscriber */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, - subscription_list) { + list_for_each_entry_safe(sub, sub_temp, &subscriber->subscrp_list, + subscrp_list) { if (sub->timeout != TIPC_WAIT_FOREVER) { spin_unlock_bh(&subscriber->lock); del_timer_sync(&sub->timer); spin_lock_bh(&subscriber->lock); } - subscr_del(sub); + tipc_subscrp_delete(sub); } spin_unlock_bh(&subscriber->lock); @@ -186,7 +178,7 @@ static void subscr_release(struct tipc_subscriber *subscriber) } /** - * subscr_cancel - handle subscription cancellation request + * tipc_subscrp_cancel - handle subscription cancellation request * * Called with subscriber lock held. Routine must temporarily release lock * to enable the subscription timeout routine to finish without deadlocking; @@ -194,16 +186,16 @@ static void subscr_release(struct tipc_subscriber *subscriber) * * Note that fields of 's' use subscriber's endianness! */ -static void subscr_cancel(struct tipc_subscr *s, - struct tipc_subscriber *subscriber) +static void tipc_subscrp_cancel(struct tipc_subscr *s, + struct tipc_subscriber *subscriber) { struct tipc_subscription *sub; struct tipc_subscription *sub_temp; int found = 0; /* Find first matching subscription, exit if not found */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, - subscription_list) { + list_for_each_entry_safe(sub, sub_temp, &subscriber->subscrp_list, + subscrp_list) { if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { found = 1; break; @@ -219,17 +211,12 @@ static void subscr_cancel(struct tipc_subscr *s, del_timer_sync(&sub->timer); spin_lock_bh(&subscriber->lock); } - subscr_del(sub); + tipc_subscrp_delete(sub); } -/** - * subscr_subscribe - create subscription for subscriber - * - * Called with subscriber lock held. - */ -static int subscr_subscribe(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, - struct tipc_subscription **sub_p) +static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, + struct tipc_subscriber *subscriber, + struct tipc_subscription **sub_p) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub; @@ -241,7 +228,7 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s, /* Detect & process a subscription cancellation request */ if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - subscr_cancel(s, subscriber); + tipc_subscrp_cancel(s, subscriber); return 0; } @@ -273,13 +260,14 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s, kfree(sub); return -EINVAL; } - list_add(&sub->subscription_list, &subscriber->subscription_list); + list_add(&sub->subscrp_list, &subscriber->subscrp_list); sub->subscriber = subscriber; sub->swap = swap; - memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr)); + memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); if (sub->timeout != TIPC_WAIT_FOREVER) { - setup_timer(&sub->timer, subscr_timeout, (unsigned long)sub); + setup_timer(&sub->timer, tipc_subscrp_timeout, + (unsigned long)sub); mod_timer(&sub->timer, jiffies + sub->timeout); } *sub_p = sub; @@ -287,22 +275,22 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s, } /* Handle one termination request for the subscriber */ -static void subscr_conn_shutdown_event(int conid, void *usr_data) +static void tipc_subscrb_shutdown_cb(int conid, void *usr_data) { - subscr_release((struct tipc_subscriber *)usr_data); + tipc_subscrb_delete((struct tipc_subscriber *)usr_data); } /* Handle one request to create a new subscription for the subscriber */ -static void subscr_conn_msg_event(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len) +static void tipc_subscrb_rcv_cb(struct net *net, int conid, + struct sockaddr_tipc *addr, void *usr_data, + void *buf, size_t len) { struct tipc_subscriber *subscriber = usr_data; struct tipc_subscription *sub = NULL; struct tipc_net *tn = net_generic(net, tipc_net_id); spin_lock_bh(&subscriber->lock); - subscr_subscribe(net, (struct tipc_subscr *)buf, subscriber, &sub); + tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscriber, &sub); if (sub) tipc_nametbl_subscribe(sub); else @@ -311,7 +299,7 @@ static void subscr_conn_msg_event(struct net *net, int conid, } /* Handle one request to establish a new subscriber */ -static void *subscr_named_msg_event(int conid) +static void *tipc_subscrb_connect_cb(int conid) { struct tipc_subscriber *subscriber; @@ -321,14 +309,14 @@ static void *subscr_named_msg_event(int conid) pr_warn("Subscriber rejected, no memory\n"); return NULL; } - INIT_LIST_HEAD(&subscriber->subscription_list); + INIT_LIST_HEAD(&subscriber->subscrp_list); subscriber->conid = conid; spin_lock_init(&subscriber->lock); return (void *)subscriber; } -int tipc_subscr_start(struct net *net) +int tipc_topsrv_start(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); const char name[] = "topology_server"; @@ -355,9 +343,9 @@ int tipc_subscr_start(struct net *net) topsrv->imp = TIPC_CRITICAL_IMPORTANCE; topsrv->type = SOCK_SEQPACKET; topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); - topsrv->tipc_conn_recvmsg = subscr_conn_msg_event; - topsrv->tipc_conn_new = subscr_named_msg_event; - topsrv->tipc_conn_shutdown = subscr_conn_shutdown_event; + topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb; + topsrv->tipc_conn_new = tipc_subscrb_connect_cb; + topsrv->tipc_conn_shutdown = tipc_subscrb_shutdown_cb; strncpy(topsrv->name, name, strlen(name) + 1); tn->topsrv = topsrv; @@ -366,7 +354,7 @@ int tipc_subscr_start(struct net *net) return tipc_server_start(topsrv); } -void tipc_subscr_stop(struct net *net) +void tipc_topsrv_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_server *topsrv = tn->topsrv; diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 33488bd9fe3c..92ee18cc5fe6 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -54,7 +54,7 @@ struct tipc_subscriber; * @filter: event filtering to be done for subscription * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list - * @subscription_list: adjacent subscriptions in subscriber's subscription list + * @subscrp_list: adjacent subscriptions in subscriber's subscription list * @server_ref: object reference of server port associated with subscription * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription @@ -67,17 +67,17 @@ struct tipc_subscription { u32 filter; struct timer_list timer; struct list_head nameseq_list; - struct list_head subscription_list; + struct list_head subscrp_list; int swap; struct tipc_event evt; }; -int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper); -void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, int must); -int tipc_subscr_start(struct net *net); -void tipc_subscr_stop(struct net *net); +int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper); +void tipc_subscrp_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, u32 event, + u32 port_ref, u32 node, int must); +int tipc_topsrv_start(struct net *net); +void tipc_topsrv_stop(struct net *net); #endif -- cgit From 1b764828add9feaa18a8f916a79b954ac8a20a73 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 4 May 2015 10:36:45 +0800 Subject: tipc: introduce tipc_subscrb_create routine Introducing a new function makes the purpose of tipc_subscrb_connect_cb callback routine more clear. Signed-off-by: Ying Xue Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/subscr.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index caec0b2b0740..d0dbde420540 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -154,6 +154,22 @@ static void tipc_subscrp_delete(struct tipc_subscription *sub) atomic_dec(&tn->subscription_count); } +static struct tipc_subscriber *tipc_subscrb_create(int conid) +{ + struct tipc_subscriber *subscriber; + + subscriber = kzalloc(sizeof(*subscriber), GFP_ATOMIC); + if (!subscriber) { + pr_warn("Subscriber rejected, no memory\n"); + return NULL; + } + INIT_LIST_HEAD(&subscriber->subscrp_list); + subscriber->conid = conid; + spin_lock_init(&subscriber->lock); + + return subscriber; +} + static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { struct tipc_subscription *sub; @@ -301,19 +317,7 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, /* Handle one request to establish a new subscriber */ static void *tipc_subscrb_connect_cb(int conid) { - struct tipc_subscriber *subscriber; - - /* Create subscriber object */ - subscriber = kzalloc(sizeof(struct tipc_subscriber), GFP_ATOMIC); - if (subscriber == NULL) { - pr_warn("Subscriber rejected, no memory\n"); - return NULL; - } - INIT_LIST_HEAD(&subscriber->subscrp_list); - subscriber->conid = conid; - spin_lock_init(&subscriber->lock); - - return (void *)subscriber; + return (void *)tipc_subscrb_create(conid); } int tipc_topsrv_start(struct net *net) -- cgit From 00bc00a9384c306cdd48611a53b955d936349bf6 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 4 May 2015 10:36:46 +0800 Subject: tipc: involve reference counter for subscriber At present subscriber's lock is used to protect the subscription list of subscriber as well as subscriptions linked into the list. While one or all subscriptions are deleted through iterating the list, the subscriber's lock must be held. Meanwhile, as deletion of subscription may happen in subscription timer's handler, the lock must be grabbed in the function as well. When subscription's timer is terminated with del_timer_sync() during above iteration, subscriber's lock has to be temporarily released, otherwise, deadlock may occur. However, the temporary release may cause the double free of a subscription as the subscription is not disconnected from the subscription list. Now if a reference counter is introduced to subscriber, subscription's timer can be asynchronously stopped with del_timer(). As a result, the issue is not only able to be fixed, but also relevant code is pretty readable and understandable. Signed-off-by: Ying Xue Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/subscr.c | 120 +++++++++++++++++++++++------------------------------- 1 file changed, 52 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index d0dbde420540..9b24c9c7e05a 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -40,16 +40,21 @@ /** * struct tipc_subscriber - TIPC network topology subscriber + * @kref: reference counter to tipc_subscription object * @conid: connection identifier to server connecting to subscriber * @lock: control access to subscriber * @subscrp_list: list of subscription objects for this subscriber */ struct tipc_subscriber { + struct kref kref; int conid; spinlock_t lock; struct list_head subscrp_list; }; +static void tipc_subscrp_delete(struct tipc_subscription *sub); +static void tipc_subscrb_put(struct tipc_subscriber *subscriber); + /** * htohl - convert value to endianness used by destination * @in: value to convert @@ -116,42 +121,34 @@ static void tipc_subscrp_timeout(unsigned long data) { struct tipc_subscription *sub = (struct tipc_subscription *)data; struct tipc_subscriber *subscriber = sub->subscriber; - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - - /* The spin lock per subscriber is used to protect its members */ - spin_lock_bh(&subscriber->lock); - - /* Validate timeout (in case subscription is being cancelled) */ - if (sub->timeout == TIPC_WAIT_FOREVER) { - spin_unlock_bh(&subscriber->lock); - return; - } - - /* Unlink subscription from name table */ - tipc_nametbl_unsubscribe(sub); - - /* Unlink subscription from subscriber */ - list_del(&sub->subscrp_list); - - spin_unlock_bh(&subscriber->lock); /* Notify subscriber of timeout */ tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, TIPC_SUBSCR_TIMEOUT, 0, 0); - /* Now destroy subscription */ - kfree(sub); - atomic_dec(&tn->subscription_count); + spin_lock_bh(&subscriber->lock); + tipc_subscrp_delete(sub); + spin_unlock_bh(&subscriber->lock); + + tipc_subscrb_put(subscriber); } -static void tipc_subscrp_delete(struct tipc_subscription *sub) +static void tipc_subscrb_kref_release(struct kref *kref) { - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + struct tipc_subscriber *subcriber = container_of(kref, + struct tipc_subscriber, kref); - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - kfree(sub); - atomic_dec(&tn->subscription_count); + kfree(subcriber); +} + +static void tipc_subscrb_put(struct tipc_subscriber *subscriber) +{ + kref_put(&subscriber->kref, tipc_subscrb_kref_release); +} + +static void tipc_subscrb_get(struct tipc_subscriber *subscriber) +{ + kref_get(&subscriber->kref); } static struct tipc_subscriber *tipc_subscrb_create(int conid) @@ -163,6 +160,7 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) pr_warn("Subscriber rejected, no memory\n"); return NULL; } + kref_init(&subscriber->kref); INIT_LIST_HEAD(&subscriber->subscrp_list); subscriber->conid = conid; spin_lock_init(&subscriber->lock); @@ -172,62 +170,48 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub; - struct tipc_subscription *sub_temp; + struct tipc_subscription *sub, *temp; spin_lock_bh(&subscriber->lock); - /* Destroy any existing subscriptions for subscriber */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscrp_list, + list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { - if (sub->timeout != TIPC_WAIT_FOREVER) { - spin_unlock_bh(&subscriber->lock); - del_timer_sync(&sub->timer); - spin_lock_bh(&subscriber->lock); + if (del_timer(&sub->timer)) { + tipc_subscrp_delete(sub); + tipc_subscrb_put(subscriber); } - tipc_subscrp_delete(sub); } spin_unlock_bh(&subscriber->lock); - /* Now destroy subscriber */ - kfree(subscriber); + tipc_subscrb_put(subscriber); +} + +static void tipc_subscrp_delete(struct tipc_subscription *sub) +{ + struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + + tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscrp_list); + kfree(sub); + atomic_dec(&tn->subscription_count); } -/** - * tipc_subscrp_cancel - handle subscription cancellation request - * - * Called with subscriber lock held. Routine must temporarily release lock - * to enable the subscription timeout routine to finish without deadlocking; - * the lock is then reclaimed to allow caller to release it upon return. - * - * Note that fields of 's' use subscriber's endianness! - */ static void tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub; - struct tipc_subscription *sub_temp; - int found = 0; + struct tipc_subscription *sub, *temp; /* Find first matching subscription, exit if not found */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscrp_list, + list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - found = 1; + if (del_timer(&sub->timer)) { + tipc_subscrp_delete(sub); + tipc_subscrb_put(subscriber); + } break; } } - if (!found) - return; - - /* Cancel subscription timer (if used), then delete subscription */ - if (sub->timeout != TIPC_WAIT_FOREVER) { - sub->timeout = TIPC_WAIT_FOREVER; - spin_unlock_bh(&subscriber->lock); - del_timer_sync(&sub->timer); - spin_lock_bh(&subscriber->lock); - } - tipc_subscrp_delete(sub); } static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, @@ -281,11 +265,11 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); - if (sub->timeout != TIPC_WAIT_FOREVER) { - setup_timer(&sub->timer, tipc_subscrp_timeout, - (unsigned long)sub); - mod_timer(&sub->timer, jiffies + sub->timeout); - } + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); + if (sub->timeout != TIPC_WAIT_FOREVER) + sub->timeout += jiffies; + if (!mod_timer(&sub->timer, sub->timeout)) + tipc_subscrb_get(subscriber); *sub_p = sub; return 0; } -- cgit From a13683f292b2ce697f71fa3788a9335ebcb32676 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 4 May 2015 10:36:47 +0800 Subject: tipc: adjust locking policy of subscription Currently subscriber's lock protects not only subscriber's subscription list but also all subscriptions linked into the list. However, as all members of subscription are never changed after they are initialized, it's unnecessary for subscription to be protected under subscriber's lock. If the lock is used to only protect subscriber's subscription list, the adjustment not only makes the locking policy simpler, but also helps to avoid a deadlock which may happen once creating a subscription is failed. Signed-off-by: Ying Xue Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/subscr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 9b24c9c7e05a..350cca33ee0a 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -201,6 +201,7 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s, { struct tipc_subscription *sub, *temp; + spin_lock_bh(&subscriber->lock); /* Find first matching subscription, exit if not found */ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { @@ -212,6 +213,7 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s, break; } } + spin_unlock_bh(&subscriber->lock); } static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, @@ -260,7 +262,9 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, kfree(sub); return -EINVAL; } + spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); + spin_unlock_bh(&subscriber->lock); sub->subscriber = subscriber; sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); @@ -289,13 +293,11 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, struct tipc_subscription *sub = NULL; struct tipc_net *tn = net_generic(net, tipc_net_id); - spin_lock_bh(&subscriber->lock); tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscriber, &sub); if (sub) tipc_nametbl_subscribe(sub); else tipc_conn_terminate(tn->topsrv, subscriber->conid); - spin_unlock_bh(&subscriber->lock); } /* Handle one request to establish a new subscriber */ -- cgit From 90bdfcb76f7d3b4a763ded3242277578ef22eda4 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 4 May 2015 10:36:48 +0800 Subject: tipc: deal with return value of tipc_conn_new callback Once tipc_conn_new() returns NULL, the connection should be shut down immediately, otherwise, oops may happen due to the NULL pointer. Signed-off-by: Ying Xue Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 77ff03ed1e18..a91a2f79209a 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -309,6 +309,10 @@ static int tipc_accept_from_sock(struct tipc_conn *con) /* Notify that new connection is incoming */ newcon->usr_data = s->tipc_conn_new(newcon->conid); + if (!newcon->usr_data) { + sock_release(newsock); + return -ENOMEM; + } /* Wake up receive process in case of 'SYN+' message */ newsock->sk->sk_data_ready(newsock->sk); -- cgit From fcba67c94abe83e0e69a65737000ccbb16a4fa03 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 5 May 2015 00:19:35 +0200 Subject: net: fix two sparse warnings introduced by IGMP/MLD parsing exports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > net/core/skbuff.c:4108:13: sparse: incorrect type in assignment (different base types) > net/ipv6/mcast_snoop.c:63 ipv6_mc_check_exthdrs() warn: unsigned 'offset' is never less than zero. Introduced by 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 ("net: Export IGMP/MLD message validation code") Reported-by: kbuild test robot Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- net/ipv6/mcast_snoop.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e4278a4dd7e..b9eb90b39ac7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4093,7 +4093,7 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, { struct sk_buff *skb_chk; unsigned int offset = skb_transport_offset(skb); - int ret; + __sum16 ret; skb_chk = skb_checksum_maybe_trim(skb, transport_len); if (!skb_chk) diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c index 1a2cbc13a7d3..df8afe5ab31e 100644 --- a/net/ipv6/mcast_snoop.c +++ b/net/ipv6/mcast_snoop.c @@ -47,7 +47,7 @@ static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) static int ipv6_mc_check_exthdrs(struct sk_buff *skb) { const struct ipv6hdr *ip6h; - unsigned int offset; + int offset; u8 nexthdr; __be16 frag_off; -- cgit From 586f2eb416f4271444e807b96346df3d30e8dfb5 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 30 Apr 2015 17:13:41 +0800 Subject: xfrm: remove the unnecessary checking before call xfrm_pol_hold xfrm_pol_hold will check its input with NULL Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c4c47f337a3a..435bc0dc7630 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1127,8 +1127,8 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, break; } } - if (ret) - xfrm_pol_hold(ret); + + xfrm_pol_hold(ret); fail: read_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -3211,8 +3211,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * } } - if (ret) - xfrm_pol_hold(ret); + xfrm_pol_hold(ret); read_unlock_bh(&net->xfrm.xfrm_policy_lock); -- cgit From de2ad486cb6cfdf3ef79a96a18d78bb1b135c0e0 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 30 Apr 2015 17:25:19 +0800 Subject: xfrm: move the checking for old xfrm_policy hold_queue to beginning if hold_queue of old xfrm_policy is NULL, return directly, then not need to run other codes, especially take the spin lock Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 435bc0dc7630..3d264e5e7409 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -700,6 +700,9 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; + if (skb_queue_empty(&pq->hold_queue)) + return; + __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); @@ -708,9 +711,6 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); - if (skb_queue_empty(&list)) - return; - pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); -- cgit From f1160434c7658af3f7b0926b88df49a66cb3c3e0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 20:25:20 +0200 Subject: mac80211: clean up global debugfs statistics The debugfs statistics macros are pointlessly verbose, so change that macro to just have a single argument. While at it, remove the unused counters and rename rx_expand_skb_head2 to the better rx_expand_skb_head_defrag. Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 74 ++++++++++++++++------------------------------ net/mac80211/ieee80211_i.h | 4 +-- net/mac80211/rx.c | 2 +- 3 files changed, 28 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 23813ebb349c..854bf42c4804 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -219,8 +219,8 @@ static const struct file_operations stats_ ##name## _ops = { \ .llseek = generic_file_llseek, \ }; -#define DEBUGFS_STATS_ADD(name, field) \ - debugfs_create_u32(#name, 0400, statsd, (u32 *) &field); +#define DEBUGFS_STATS_ADD(name) \ + debugfs_create_u32(#name, 0400, statsd, &local->name); #define DEBUGFS_DEVSTATS_ADD(name) \ debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops); @@ -255,53 +255,31 @@ void debugfs_hw_add(struct ieee80211_local *local) if (!statsd) return; - DEBUGFS_STATS_ADD(transmitted_fragment_count, - local->dot11TransmittedFragmentCount); - DEBUGFS_STATS_ADD(multicast_transmitted_frame_count, - local->dot11MulticastTransmittedFrameCount); - DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount); - DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount); - DEBUGFS_STATS_ADD(multiple_retry_count, - local->dot11MultipleRetryCount); - DEBUGFS_STATS_ADD(frame_duplicate_count, - local->dot11FrameDuplicateCount); - DEBUGFS_STATS_ADD(received_fragment_count, - local->dot11ReceivedFragmentCount); - DEBUGFS_STATS_ADD(multicast_received_frame_count, - local->dot11MulticastReceivedFrameCount); - DEBUGFS_STATS_ADD(transmitted_frame_count, - local->dot11TransmittedFrameCount); + DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount); + DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount); + DEBUGFS_STATS_ADD(dot11FailedCount); + DEBUGFS_STATS_ADD(dot11RetryCount); + DEBUGFS_STATS_ADD(dot11MultipleRetryCount); + DEBUGFS_STATS_ADD(dot11FrameDuplicateCount); + DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); + DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); + DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS - DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop); - DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued); - DEBUGFS_STATS_ADD(tx_handlers_drop_fragment, - local->tx_handlers_drop_fragment); - DEBUGFS_STATS_ADD(tx_handlers_drop_wep, - local->tx_handlers_drop_wep); - DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc, - local->tx_handlers_drop_not_assoc); - DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port, - local->tx_handlers_drop_unauth_port); - DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop); - DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued); - DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc, - local->rx_handlers_drop_nullfunc); - DEBUGFS_STATS_ADD(rx_handlers_drop_defrag, - local->rx_handlers_drop_defrag); - DEBUGFS_STATS_ADD(rx_handlers_drop_short, - local->rx_handlers_drop_short); - DEBUGFS_STATS_ADD(tx_expand_skb_head, - local->tx_expand_skb_head); - DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned, - local->tx_expand_skb_head_cloned); - DEBUGFS_STATS_ADD(rx_expand_skb_head, - local->rx_expand_skb_head); - DEBUGFS_STATS_ADD(rx_expand_skb_head2, - local->rx_expand_skb_head2); - DEBUGFS_STATS_ADD(rx_handlers_fragments, - local->rx_handlers_fragments); - DEBUGFS_STATS_ADD(tx_status_drop, - local->tx_status_drop); + DEBUGFS_STATS_ADD(tx_handlers_drop); + DEBUGFS_STATS_ADD(tx_handlers_queued); + DEBUGFS_STATS_ADD(tx_handlers_drop_wep); + DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); + DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port); + DEBUGFS_STATS_ADD(rx_handlers_drop); + DEBUGFS_STATS_ADD(rx_handlers_queued); + DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc); + DEBUGFS_STATS_ADD(rx_handlers_drop_defrag); + DEBUGFS_STATS_ADD(rx_handlers_drop_short); + DEBUGFS_STATS_ADD(tx_expand_skb_head); + DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned); + DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag); + DEBUGFS_STATS_ADD(rx_handlers_fragments); + DEBUGFS_STATS_ADD(tx_status_drop); #endif DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount); DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e9f36f7ec733..987ec235e690 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1282,7 +1282,6 @@ struct ieee80211_local { /* TX/RX handler statistics */ unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; - unsigned int tx_handlers_drop_fragment; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; unsigned int tx_handlers_drop_unauth_port; @@ -1293,8 +1292,7 @@ struct ieee80211_local { unsigned int rx_handlers_drop_short; unsigned int tx_expand_skb_head; unsigned int tx_expand_skb_head_cloned; - unsigned int rx_expand_skb_head; - unsigned int rx_expand_skb_head2; + unsigned int rx_expand_skb_head_defrag; unsigned int rx_handlers_fragments; unsigned int tx_status_drop; #define I802_DEBUG_INC(c) (c)++ diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e08253547f42..19b39367f8bf 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1868,7 +1868,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->skb = __skb_dequeue(&entry->skb_list); if (skb_tailroom(rx->skb) < entry->extra_len) { - I802_DEBUG_INC(rx->local->rx_expand_skb_head2); + I802_DEBUG_INC(rx->local->rx_expand_skb_head_defrag); if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len, GFP_ATOMIC))) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); -- cgit From c206ca670974cefec7ac3732db5c8156e8081a8d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 20:47:28 +0200 Subject: mac80211: move dot11 counters under MAC80211_DEBUG_COUNTERS Since these counters can only be read through debugfs, there's very little point in maintaining them all the time. However, even just making them depend on debugfs is pointless - they're not normally used. Additionally a number of them aren't even concurrency safe. Move them under MAC80211_DEBUG_COUNTERS so they're normally not even compiled in. Signed-off-by: Johannes Berg --- net/mac80211/Kconfig | 6 +++++- net/mac80211/debugfs.c | 2 +- net/mac80211/ieee80211_i.h | 16 ++++++++-------- net/mac80211/rx.c | 6 +++--- net/mac80211/status.c | 28 ++++++++++++++-------------- 5 files changed, 31 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 5f698d4ec53b..086de496a4c1 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -302,7 +302,11 @@ config MAC80211_DEBUG_COUNTERS ---help--- Selecting this option causes mac80211 to keep additional and very verbose statistics about TX and RX handler use - and show them in debugfs. + as well as a few selected dot11 counters. These will be + exposed in debugfs. + + Note that some of the counters are not concurrency safe + and may thus not always be accurate. If unsure, say N. diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 854bf42c4804..b17206db49b4 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -255,6 +255,7 @@ void debugfs_hw_add(struct ieee80211_local *local) if (!statsd) return; +#ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount); DEBUGFS_STATS_ADD(dot11FailedCount); @@ -264,7 +265,6 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); -#ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_ADD(tx_handlers_drop); DEBUGFS_STATS_ADD(tx_handlers_queued); DEBUGFS_STATS_ADD(tx_handlers_drop_wep); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 987ec235e690..ca00127497f8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1259,6 +1259,14 @@ struct ieee80211_local { struct list_head chanctx_list; struct mutex chanctx_mtx; +#ifdef CONFIG_MAC80211_LEDS + struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led; + struct tpt_led_trigger *tpt_led_trigger; + char tx_led_name[32], rx_led_name[32], + assoc_led_name[32], radio_led_name[32]; +#endif + +#ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* SNMP counters */ /* dot11CountersTable */ u32 dot11TransmittedFragmentCount; @@ -1271,14 +1279,6 @@ struct ieee80211_local { u32 dot11MulticastReceivedFrameCount; u32 dot11TransmittedFrameCount; -#ifdef CONFIG_MAC80211_LEDS - struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led; - struct tpt_led_trigger *tpt_led_trigger; - char tx_led_name[32], rx_led_name[32], - assoc_led_name[32], radio_led_name[32]; -#endif - -#ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* TX/RX handler statistics */ unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 19b39367f8bf..aa35977a9c4d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1077,7 +1077,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) if (unlikely(ieee80211_has_retry(hdr->frame_control) && rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) { - rx->local->dot11FrameDuplicateCount++; + I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount); rx->sta->num_duplicates++; return RX_DROP_UNUSABLE; } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) { @@ -1785,7 +1785,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) frag = sc & IEEE80211_SCTL_FRAG; if (is_multicast_ether_addr(hdr->addr1)) { - rx->local->dot11MulticastReceivedFrameCount++; + I802_DEBUG_INC(rx->local->dot11MulticastReceivedFrameCount); goto out_no_led; } @@ -3410,7 +3410,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, rx.local = local; if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) - local->dot11ReceivedFragmentCount++; + I802_DEBUG_INC(local->dot11ReceivedFragmentCount); if (ieee80211_is_mgmt(fc)) { /* drop frame if too short for header */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 005fdbe39a8b..461594966b65 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -631,15 +631,15 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, } if (acked || noack_success) { - local->dot11TransmittedFrameCount++; - if (!pubsta) - local->dot11MulticastTransmittedFrameCount++; - if (retry_count > 0) - local->dot11RetryCount++; - if (retry_count > 1) - local->dot11MultipleRetryCount++; + I802_DEBUG_INC(local->dot11TransmittedFrameCount); + if (!pubsta) + I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); + if (retry_count > 0) + I802_DEBUG_INC(local->dot11RetryCount); + if (retry_count > 1) + I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { - local->dot11FailedCount++; + I802_DEBUG_INC(local->dot11FailedCount); } } EXPORT_SYMBOL(ieee80211_tx_status_noskb); @@ -802,13 +802,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { - local->dot11TransmittedFrameCount++; + I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) - local->dot11MulticastTransmittedFrameCount++; + I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) - local->dot11RetryCount++; + I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) - local->dot11MultipleRetryCount++; + I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU @@ -818,10 +818,10 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) - local->dot11TransmittedFragmentCount++; + I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) - local->dot11FailedCount++; + I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && -- cgit From f83f1c129a6f29830de74a47521dd68e57914579 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 20:55:55 +0200 Subject: mac80211: remove sta->tx_fragments counter This counter is unsafe with concurrent TX and is only exposed through debugfs and ethtool. Instead of trying to fix it just remove it for now, if it's really needed then it should be exposed through nl80211 and in a way that drivers that do the fragmentation in the device could support it as well. Signed-off-by: Johannes Berg --- net/mac80211/debugfs_sta.c | 1 - net/mac80211/ethtool.c | 3 +-- net/mac80211/sta_info.h | 2 -- net/mac80211/tx.c | 2 -- 4 files changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 252859e90e8a..7ca601414000 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -452,7 +452,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates); DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments); DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped); - DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments); DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count); DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed); DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count); diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 52bcea6ad9e8..188faab11c24 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -38,7 +38,7 @@ static void ieee80211_get_ringparam(struct net_device *dev, static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", - "tx_packets", "tx_bytes", "tx_fragments", + "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "beacon_loss", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", @@ -87,7 +87,6 @@ static void ieee80211_get_stats(struct net_device *dev, \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ - data[i++] += sta->tx_fragments; \ data[i++] += sta->tx_filtered_count; \ data[i++] += sta->tx_retry_failed; \ data[i++] += sta->tx_retry_count; \ diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index c821fe47c2d1..fdd981c0fa21 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -323,7 +323,6 @@ struct ieee80211_fast_tx { * @fail_avg: moving percentage of failed MSDUs * @tx_packets: number of RX/TX MSDUs * @tx_bytes: number of bytes transmitted to this STA - * @tx_fragments: number of transmitted MPDUs * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: identity mapping to ID timers @@ -433,7 +432,6 @@ struct sta_info { unsigned int fail_avg; /* Updated from TX path only, no locking requirements */ - u32 tx_fragments; u64 tx_packets[IEEE80211_NUM_ACS]; u64 tx_bytes[IEEE80211_NUM_ACS]; struct ieee80211_tx_rate last_tx_rate; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 745fdf5c2722..db5e40360924 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -997,7 +997,6 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) skb_queue_walk(&tx->skbs, skb) { ac = skb_get_queue_mapping(skb); - tx->sta->tx_fragments++; tx->sta->tx_bytes[ac] += skb->len; } if (ac >= 0) @@ -2804,7 +2803,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, /* statistics normally done by ieee80211_tx_h_stats (but that * has to consider fragmentation, so is more complex) */ - sta->tx_fragments++; sta->tx_bytes[skb_get_queue_mapping(skb)] += skb->len; sta->tx_packets[skb_get_queue_mapping(skb)]++; -- cgit From 56ff51084eadf61a93941e9a2a20608d8dd45711 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 21:07:39 +0200 Subject: mac80211: clean up station debugfs Remove items that can be retrieved through nl80211. This also removes two items (tx_packets and tx_bytes) where only the VO counter was exposed since they are split up per AC but in the debugfs file only the first AC was shown. Also remove the useless "dev" file - the stations have long been in a sub-directory of the netdev so there's no need for that any more. Signed-off-by: Johannes Berg --- net/mac80211/debugfs_sta.c | 84 ---------------------------------------------- 1 file changed, 84 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 7ca601414000..06d52935036d 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -29,8 +29,6 @@ static ssize_t sta_ ##name## _read(struct file *file, \ format_string, sta->field); \ } #define STA_READ_D(name, field) STA_READ(name, field, "%d\n") -#define STA_READ_U(name, field) STA_READ(name, field, "%u\n") -#define STA_READ_S(name, field) STA_READ(name, field, "%s\n") #define STA_OPS(name) \ static const struct file_operations sta_ ##name## _ops = { \ @@ -52,10 +50,7 @@ static const struct file_operations sta_ ##name## _ops = { \ STA_OPS(name) STA_FILE(aid, sta.aid, D); -STA_FILE(dev, sdata->name, S); -STA_FILE(last_signal, last_signal, D); STA_FILE(last_ack_signal, last_ack_signal, D); -STA_FILE(beacon_loss_count, beacon_loss_count, D); static ssize_t sta_flags_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -101,40 +96,6 @@ static ssize_t sta_num_ps_buf_frames_read(struct file *file, } STA_OPS(num_ps_buf_frames); -static ssize_t sta_inactive_ms_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - return mac80211_format_buffer(userbuf, count, ppos, "%d\n", - jiffies_to_msecs(jiffies - sta->last_rx)); -} -STA_OPS(inactive_ms); - - -static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct timespec uptime; - struct tm result; - long connected_time_secs; - char buf[100]; - int res; - ktime_get_ts(&uptime); - connected_time_secs = uptime.tv_sec - sta->last_connected; - time_to_tm(connected_time_secs, 0, &result); - result.tm_year -= 70; - result.tm_mday -= 1; - res = scnprintf(buf, sizeof(buf), - "years - %ld\nmonths - %d\ndays - %d\nclock - %d:%d:%d\n\n", - result.tm_year, result.tm_mon, result.tm_mday, - result.tm_hour, result.tm_min, result.tm_sec); - return simple_read_from_buffer(userbuf, count, ppos, buf, res); -} -STA_OPS(connected_time); - - - static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -359,37 +320,6 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, } STA_OPS(vht_capa); -static ssize_t sta_current_tx_rate_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct rate_info rinfo; - u16 rate; - sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo); - rate = cfg80211_calculate_bitrate(&rinfo); - - return mac80211_format_buffer(userbuf, count, ppos, - "%d.%d MBit/s\n", - rate/10, rate%10); -} -STA_OPS(current_tx_rate); - -static ssize_t sta_last_rx_rate_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct rate_info rinfo; - u16 rate; - - sta_set_rate_info_rx(sta, &rinfo); - - rate = cfg80211_calculate_bitrate(&rinfo); - - return mac80211_format_buffer(userbuf, count, ppos, - "%d.%d MBit/s\n", - rate/10, rate%10); -} -STA_OPS(last_rx_rate); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ @@ -432,29 +362,15 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(flags); DEBUGFS_ADD(num_ps_buf_frames); - DEBUGFS_ADD(inactive_ms); - DEBUGFS_ADD(connected_time); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); - DEBUGFS_ADD(dev); - DEBUGFS_ADD(last_signal); - DEBUGFS_ADD(beacon_loss_count); DEBUGFS_ADD(ht_capa); DEBUGFS_ADD(vht_capa); DEBUGFS_ADD(last_ack_signal); - DEBUGFS_ADD(current_tx_rate); - DEBUGFS_ADD(last_rx_rate); - DEBUGFS_ADD_COUNTER(rx_packets, rx_packets); - DEBUGFS_ADD_COUNTER(tx_packets, tx_packets); - DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes); - DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes); DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates); DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments); - DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped); DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count); - DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed); - DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count); if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) debugfs_create_x32("driver_buffered_tids", 0400, -- cgit From f5c4ae07992ca64d8628a11439c184baf5595e4b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2015 12:09:01 +0200 Subject: mac80211: make LED trigger names const This is just a code cleanup, make the LED trigger names const as they're not expected to be modified by drivers. Signed-off-by: Johannes Berg --- net/mac80211/led.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/led.c b/net/mac80211/led.c index e2b836446af3..3e13eb86f85d 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -133,7 +133,7 @@ void ieee80211_led_exit(struct ieee80211_local *local) } } -char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); @@ -141,7 +141,7 @@ char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); -char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); @@ -149,7 +149,7 @@ char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); -char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); @@ -157,7 +157,7 @@ char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); -char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); @@ -211,10 +211,11 @@ static void tpt_trig_timer(unsigned long data) read_unlock(&tpt_trig->trig.leddev_list_lock); } -char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, - unsigned int flags, - const struct ieee80211_tpt_blink *blink_table, - unsigned int blink_table_len) +const char * +__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, + unsigned int flags, + const struct ieee80211_tpt_blink *blink_table, + unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; -- cgit From 8d5c25856859bd826aca4b88103552a80b344cef Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2015 12:19:22 +0200 Subject: mac80211: make LED triggering depend on activation When LED triggers are compiled in, but not used, mac80211 will still call them to update the status. This isn't really a problem for the assoc and radio ones, but the TX/RX (and to a certain extend TPT) ones can be called very frequently (for every packet.) In order to avoid that when they're not used, track their activation and call the corresponding trigger (and in the TPT case, account for throughput) only when the trigger is actually used by an LED. Additionally, make those trigger functions inlines since theyre only used once in the remaining code. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 7 +- net/mac80211/led.c | 239 +++++++++++++++++++++++++++++---------------- net/mac80211/led.h | 44 ++++++--- net/mac80211/main.c | 4 +- 4 files changed, 194 insertions(+), 100 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ca00127497f8..241b74f3bd81 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1260,10 +1260,11 @@ struct ieee80211_local { struct mutex chanctx_mtx; #ifdef CONFIG_MAC80211_LEDS - struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led; + struct led_trigger tx_led, rx_led, assoc_led, radio_led; + struct led_trigger tpt_led; + atomic_t tx_led_active, rx_led_active, assoc_led_active; + atomic_t radio_led_active, tpt_led_active; struct tpt_led_trigger *tpt_led_trigger; - char tx_led_name[32], rx_led_name[32], - assoc_led_name[32], radio_led_name[32]; #endif #ifdef CONFIG_MAC80211_DEBUG_COUNTERS diff --git a/net/mac80211/led.c b/net/mac80211/led.c index 3e13eb86f85d..38f05565eaac 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -12,96 +12,175 @@ #include #include "led.h" -#define MAC80211_BLINK_DELAY 50 /* ms */ - -void ieee80211_led_rx(struct ieee80211_local *local) -{ - unsigned long led_delay = MAC80211_BLINK_DELAY; - if (unlikely(!local->rx_led)) - return; - led_trigger_blink_oneshot(local->rx_led, &led_delay, &led_delay, 0); -} - -void ieee80211_led_tx(struct ieee80211_local *local) -{ - unsigned long led_delay = MAC80211_BLINK_DELAY; - if (unlikely(!local->tx_led)) - return; - led_trigger_blink_oneshot(local->tx_led, &led_delay, &led_delay, 0); -} - void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { - if (unlikely(!local->assoc_led)) + if (!atomic_read(&local->assoc_led_active)) return; if (associated) - led_trigger_event(local->assoc_led, LED_FULL); + led_trigger_event(&local->assoc_led, LED_FULL); else - led_trigger_event(local->assoc_led, LED_OFF); + led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { - if (unlikely(!local->radio_led)) + if (!atomic_read(&local->radio_led_active)) return; if (enabled) - led_trigger_event(local->radio_led, LED_FULL); + led_trigger_event(&local->radio_led, LED_FULL); else - led_trigger_event(local->radio_led, LED_OFF); + led_trigger_event(&local->radio_led, LED_OFF); +} + +void ieee80211_alloc_led_names(struct ieee80211_local *local) +{ + local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", + wiphy_name(local->hw.wiphy)); + local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", + wiphy_name(local->hw.wiphy)); + local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", + wiphy_name(local->hw.wiphy)); + local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", + wiphy_name(local->hw.wiphy)); +} + +void ieee80211_free_led_names(struct ieee80211_local *local) +{ + kfree(local->rx_led.name); + kfree(local->tx_led.name); + kfree(local->assoc_led.name); + kfree(local->radio_led.name); +} + +static void ieee80211_tx_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tx_led); + + atomic_inc(&local->tx_led_active); +} + +static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tx_led); + + atomic_dec(&local->tx_led_active); +} + +static void ieee80211_rx_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + rx_led); + + atomic_inc(&local->rx_led_active); +} + +static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + rx_led); + + atomic_dec(&local->rx_led_active); +} + +static void ieee80211_assoc_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + assoc_led); + + atomic_inc(&local->assoc_led_active); +} + +static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + assoc_led); + + atomic_dec(&local->assoc_led_active); +} + +static void ieee80211_radio_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + radio_led); + + atomic_inc(&local->radio_led_active); +} + +static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + radio_led); + + atomic_dec(&local->radio_led_active); +} + +static void ieee80211_tpt_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tpt_led); + + atomic_inc(&local->tpt_led_active); } -void ieee80211_led_names(struct ieee80211_local *local) +static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { - snprintf(local->rx_led_name, sizeof(local->rx_led_name), - "%srx", wiphy_name(local->hw.wiphy)); - snprintf(local->tx_led_name, sizeof(local->tx_led_name), - "%stx", wiphy_name(local->hw.wiphy)); - snprintf(local->assoc_led_name, sizeof(local->assoc_led_name), - "%sassoc", wiphy_name(local->hw.wiphy)); - snprintf(local->radio_led_name, sizeof(local->radio_led_name), - "%sradio", wiphy_name(local->hw.wiphy)); + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tpt_led); + + atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { - local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->rx_led) { - local->rx_led->name = local->rx_led_name; - if (led_trigger_register(local->rx_led)) { - kfree(local->rx_led); - local->rx_led = NULL; - } + atomic_set(&local->rx_led_active, 0); + local->rx_led.activate = ieee80211_rx_led_activate; + local->rx_led.deactivate = ieee80211_rx_led_deactivate; + if (local->rx_led.name && led_trigger_register(&local->rx_led)) { + kfree(local->rx_led.name); + local->rx_led.name = NULL; } - local->tx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->tx_led) { - local->tx_led->name = local->tx_led_name; - if (led_trigger_register(local->tx_led)) { - kfree(local->tx_led); - local->tx_led = NULL; - } + atomic_set(&local->tx_led_active, 0); + local->tx_led.activate = ieee80211_tx_led_activate; + local->tx_led.deactivate = ieee80211_tx_led_deactivate; + if (local->tx_led.name && led_trigger_register(&local->tx_led)) { + kfree(local->tx_led.name); + local->tx_led.name = NULL; } - local->assoc_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->assoc_led) { - local->assoc_led->name = local->assoc_led_name; - if (led_trigger_register(local->assoc_led)) { - kfree(local->assoc_led); - local->assoc_led = NULL; - } + atomic_set(&local->assoc_led_active, 0); + local->assoc_led.activate = ieee80211_assoc_led_activate; + local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; + if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { + kfree(local->assoc_led.name); + local->assoc_led.name = NULL; } - local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->radio_led) { - local->radio_led->name = local->radio_led_name; - if (led_trigger_register(local->radio_led)) { - kfree(local->radio_led); - local->radio_led = NULL; - } + atomic_set(&local->radio_led_active, 0); + local->radio_led.activate = ieee80211_radio_led_activate; + local->radio_led.deactivate = ieee80211_radio_led_deactivate; + if (local->radio_led.name && led_trigger_register(&local->radio_led)) { + kfree(local->radio_led.name); + local->radio_led.name = NULL; } + atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { - if (led_trigger_register(&local->tpt_led_trigger->trig)) { + local->tpt_led.activate = ieee80211_tpt_led_activate; + local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; + if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } @@ -110,25 +189,17 @@ void ieee80211_led_init(struct ieee80211_local *local) void ieee80211_led_exit(struct ieee80211_local *local) { - if (local->radio_led) { - led_trigger_unregister(local->radio_led); - kfree(local->radio_led); - } - if (local->assoc_led) { - led_trigger_unregister(local->assoc_led); - kfree(local->assoc_led); - } - if (local->tx_led) { - led_trigger_unregister(local->tx_led); - kfree(local->tx_led); - } - if (local->rx_led) { - led_trigger_unregister(local->rx_led); - kfree(local->rx_led); - } + if (local->radio_led.name) + led_trigger_unregister(&local->radio_led); + if (local->assoc_led.name) + led_trigger_unregister(&local->assoc_led); + if (local->tx_led.name) + led_trigger_unregister(&local->tx_led); + if (local->rx_led.name) + led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { - led_trigger_unregister(&local->tpt_led_trigger->trig); + led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } @@ -137,7 +208,7 @@ const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->radio_led_name; + return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); @@ -145,7 +216,7 @@ const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->assoc_led_name; + return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); @@ -153,7 +224,7 @@ const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->tx_led_name; + return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); @@ -161,7 +232,7 @@ const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->rx_led_name; + return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); @@ -230,7 +301,7 @@ __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); - tpt_trig->trig.name = tpt_trig->name; + local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; diff --git a/net/mac80211/led.h b/net/mac80211/led.h index 89f4344f13b9..a7893a1ac98b 100644 --- a/net/mac80211/led.h +++ b/net/mac80211/led.h @@ -11,25 +11,42 @@ #include #include "ieee80211_i.h" +#define MAC80211_BLINK_DELAY 50 /* ms */ + +static inline void ieee80211_led_rx(struct ieee80211_local *local) +{ +#ifdef CONFIG_MAC80211_LEDS + unsigned long led_delay = MAC80211_BLINK_DELAY; + + if (!atomic_read(&local->rx_led_active)) + return; + led_trigger_blink_oneshot(&local->rx_led, &led_delay, &led_delay, 0); +#endif +} + +static inline void ieee80211_led_tx(struct ieee80211_local *local) +{ +#ifdef CONFIG_MAC80211_LEDS + unsigned long led_delay = MAC80211_BLINK_DELAY; + + if (!atomic_read(&local->tx_led_active)) + return; + led_trigger_blink_oneshot(&local->tx_led, &led_delay, &led_delay, 0); +#endif +} + #ifdef CONFIG_MAC80211_LEDS -void ieee80211_led_rx(struct ieee80211_local *local); -void ieee80211_led_tx(struct ieee80211_local *local); void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); void ieee80211_led_radio(struct ieee80211_local *local, bool enabled); -void ieee80211_led_names(struct ieee80211_local *local); +void ieee80211_alloc_led_names(struct ieee80211_local *local); +void ieee80211_free_led_names(struct ieee80211_local *local); void ieee80211_led_init(struct ieee80211_local *local); void ieee80211_led_exit(struct ieee80211_local *local); void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off); #else -static inline void ieee80211_led_rx(struct ieee80211_local *local) -{ -} -static inline void ieee80211_led_tx(struct ieee80211_local *local) -{ -} static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { @@ -38,7 +55,10 @@ static inline void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { } -static inline void ieee80211_led_names(struct ieee80211_local *local) +static inline void ieee80211_alloc_led_names(struct ieee80211_local *local) +{ +} +static inline void ieee80211_free_led_names(struct ieee80211_local *local) { } static inline void ieee80211_led_init(struct ieee80211_local *local) @@ -58,7 +78,7 @@ static inline void ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (local->tpt_led_trigger && ieee80211_is_data(fc)) + if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } @@ -67,7 +87,7 @@ static inline void ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (local->tpt_led_trigger && ieee80211_is_data(fc)) + if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index b144de971366..effe9d39cd7e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -643,7 +643,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); - ieee80211_led_names(local); + ieee80211_alloc_led_names(local); ieee80211_roc_setup(local); @@ -1207,6 +1207,8 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) sta_info_stop(local); + ieee80211_free_led_names(local); + wiphy_free(local->hw.wiphy); } EXPORT_SYMBOL(ieee80211_free_hw); -- cgit From f603f1f342547b597450b9d1e1e00b3e07c6e41b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 May 2015 15:25:33 +0200 Subject: mac80211: remove useless skb->encapsulation check No current (and planned, as far as I know) wifi devices support encapsulation checksum offload, so remove the useless test here. Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index db5e40360924..8df134213adf 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2893,12 +2893,8 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, * fix it up in software before we handle anything else. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); if (skb_checksum_help(skb)) goto out_free; } -- cgit From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 May 2015 21:34:46 -0700 Subject: tcp: provide SYN headers for passive connections This patch allows a server application to get the TCP SYN headers for its passive connections. This is useful if the server is doing fingerprinting of clients based on SYN packet contents. Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN. The first is used on a socket to enable saving the SYN headers for child connections. This can be set before or after the listen() call. The latter is used to retrieve the SYN headers for passive connections, if the parent listener has enabled TCP_SAVE_SYN. TCP_SAVED_SYN is read once, it frees the saved SYN headers. The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP headers. Original patch was written by Tom Herbert, I changed it to not hold a full skb (and associated dst and conntracking reference). We have used such patch for about 3 years at Google. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 35 +++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 18 ++++++++++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 3 +++ 4 files changed, 57 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46efa03d2b11..ecccfdc50d76 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2482,6 +2482,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2818,6 +2825,34 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + len = min_t(unsigned int, tp->saved_syn[0], len); + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + return 0; + } default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 09bdc4abfcbb..df2ca615cd0c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6060,6 +6060,23 @@ static bool tcp_syn_flood_action(struct sock *sk, return want_cookie; } +static void tcp_reqsk_record_syn(const struct sock *sk, + struct request_sock *req, + const struct sk_buff *skb) +{ + if (tcp_sk(sk)->save_syn) { + u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); + u32 *copy; + + copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); + if (copy) { + copy[0] = len; + memcpy(©[1], skb_network_header(skb), len); + req->saved_syn = copy; + } + } +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -6192,6 +6209,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } + tcp_reqsk_record_syn(sk, req, skb); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc1c658ec6c1..91cb4768a860 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1802,6 +1802,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); sock_release_memcg(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e5d7649136fc..ebe2ab2596ed 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -536,6 +536,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->saved_syn = req->saved_syn; + req->saved_syn = NULL; + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; -- cgit From 784b58a327ad16967ab64bbfa558df81980d31e9 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Mon, 4 May 2015 22:47:13 +0200 Subject: bridge: change BR_GROUPFWD_RESTRICTED to allow forwarding of LLDP frames BR_GROUPFWD_RESTRICTED bitmask restricts users from setting values to /sys/class/net/brX/bridge/group_fwd_mask that allow forwarding of some IEEE 802.1D Table 7-10 Reserved addresses: (MAC Control) 802.3 01-80-C2-00-00-01 (Link Aggregation) 802.3 01-80-C2-00-00-02 802.1AB LLDP 01-80-C2-00-00-0E Change BR_GROUPFWD_RESTRICTED to allow to forward LLDP frames and document group_fwd_mask. e.g. echo 16384 > /sys/class/net/brX/bridge/group_fwd_mask allows to forward LLDP frames. This may be needed for bridge setups used for network troubleshooting or any other scenario where forwarding of LLDP frames is desired (e.g. bridge connecting a virtual machine to real switch transmitting LLDP frames that virtual machine needs to receive). Tested on a simple bridge setup with two interfaces and host transmitting LLDP frames on one side of this bridge (used lldpd). Setting group_fwd_mask as described above lets LLDP frames traverse bridge. Signed-off-by: Bernhard Thaler Signed-off-by: David S. Miller --- net/bridge/br_private.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3362c29400f1..1f36fa70639b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -33,8 +33,8 @@ /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 -/* Don't allow forwarding control protocols like STP and LLDP */ -#define BR_GROUPFWD_RESTRICTED 0x4007u +/* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ +#define BR_GROUPFWD_RESTRICTED 0x0007u /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u -- cgit From 2c7a88c252bf3381958cf716f31b6b2e0f2f3fa7 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:33:48 -0700 Subject: etherdev: Fix sparse error, make test usable by other functions This change does two things. First it fixes a sparse error for the fact that the __be16 degrades to an integer. Since that is actually what I am kind of doing I am simply working around that by forcing both sides of the comparison to u16. Also I realized on some compilers I was generating another instruction for big endian systems such as PowerPC since it was masking the value before doing the comparison. So to resolve that I have simply pulled the mask out and wrapped it in an #ifndef __BIG_ENDIAN. Lastly I pulled this all out into its own function. I notices there are similar checks in a number of other places so this function can be reused there to help reduce overhead in these paths as well. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ethernet/eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 314e4c5a5a5e..9045e2a1108f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -179,7 +179,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); - if (likely((eth->h_proto & htons(0xFF00)) >= htons(ETH_P_802_3_MIN))) + if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* -- cgit From 27cf6a6e69a52b07adc68af11e3e39548593baf0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:33:54 -0700 Subject: ebtables: Use eth_proto_is_802_3 Replace "ntohs(proto) >= ETH_P_802_3_MIN" w/ eth_proto_is_802_3(proto). Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/bridge/netfilter/ebtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 91180a7fc943..5149d9e71114 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -139,7 +139,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { - if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO)) + if (FWINV2(eth_proto_is_802_3(ethproto), EBT_IPROTO)) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && FWINV2(e->ethproto != ethproto, EBT_IPROTO)) -- cgit From d181ddca89cf8680db6c767ae38e3b66224d1657 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:33:59 -0700 Subject: ipv4/ip_tunnel_core: Use eth_proto_is_802_3 Replace "ntohs(proto) >= ETH_P_802_3_MIN" w/ eth_proto_is_802_3(proto). Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index ce63ab21b6cd..3998b1822d85 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -98,7 +98,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) return -ENOMEM; eh = (struct ethhdr *)skb->data; - if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + if (likely(eth_proto_is_802_3(eh->h_proto))) skb->protocol = eh->h_proto; else skb->protocol = htons(ETH_P_802_2); -- cgit From 6713fc9b8fa33444aa000f0f31076f6a859ccb34 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:34:05 -0700 Subject: openvswitch: Use eth_proto_is_802_3 Replace "ntohs(proto) >= ETH_P_802_3_MIN" w/ eth_proto_is_802_3(proto). Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 +- net/openvswitch/flow.c | 4 ++-- net/openvswitch/flow_netlink.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 096c6276e6b9..3b90461317ec 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -545,7 +545,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) /* Normally, setting the skb 'protocol' field would be handled by a * call to eth_type_trans(), but it assumes there's a sending * device, which we may not have. */ - if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(eth->h_proto)) packet->protocol = eth->h_proto; else packet->protocol = htons(ETH_P_802_2); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2dacc7b5af23..bc7b0aba994a 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -332,7 +332,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) proto = *(__be16 *) skb->data; __skb_pull(skb, sizeof(__be16)); - if (ntohs(proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(proto)) return proto; if (skb->len < sizeof(struct llc_snap_hdr)) @@ -349,7 +349,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) __skb_pull(skb, sizeof(struct llc_snap_hdr)); - if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(llc->ethertype)) return llc->ethertype; return htons(ETH_P_802_2); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c691b1a1eee0..624e41c4267f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -816,7 +816,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, if (is_mask) { /* Always exact match EtherType. */ eth_type = htons(0xffff); - } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + } else if (!eth_proto_is_802_3(eth_type)) { OVS_NLERR(log, "EtherType %x is less than min %x", ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; -- cgit From 9352c19f639354f093cb5457315c01bcb94aa82a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Apr 2015 18:12:41 +0200 Subject: mac80211: extend get_tkip_seq to all keys Extend the function to read the TKIP IV32/IV16 to read the IV/PN for all ciphers in order to allow drivers with full hardware crypto to properly support this. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 91 ++++++++++++++++++++++++++++++----------------- net/mac80211/driver-ops.h | 11 +++--- net/mac80211/trace.h | 42 +++++++++++----------- 3 files changed, 87 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 72a0178af737..dd7014b09396 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -312,6 +312,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, u32 iv32; u16 iv16; int err = -ENOENT; + struct ieee80211_key_seq kseq = {}; sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -342,10 +343,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, iv32 = key->u.tkip.tx.iv32; iv16 = key->u.tkip.tx.iv16; - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) - drv_get_tkip_seq(sdata->local, - key->conf.hw_key_idx, - &iv32, &iv16); + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + iv32 = kseq.tkip.iv32; + iv16 = kseq.tkip.iv16; + } seq[0] = iv16 & 0xff; seq[1] = (iv16 >> 8) & 0xff; @@ -358,49 +361,73 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = atomic64_read(&key->u.ccmp.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + memcpy(seq, kseq.ccmp.pn, 6); + } else { + pn64 = atomic64_read(&key->u.ccmp.tx_pn); + seq[0] = pn64; + seq[1] = pn64 >> 8; + seq[2] = pn64 >> 16; + seq[3] = pn64 >> 24; + seq[4] = pn64 >> 32; + seq[5] = pn64 >> 40; + } params.seq = seq; params.seq_len = 6; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + memcpy(seq, kseq.aes_cmac.pn, 6); + } else { + pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); + seq[0] = pn64; + seq[1] = pn64 >> 8; + seq[2] = pn64 >> 16; + seq[3] = pn64 >> 24; + seq[4] = pn64 >> 32; + seq[5] = pn64 >> 40; + } params.seq = seq; params.seq_len = 6; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = atomic64_read(&key->u.aes_gmac.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + memcpy(seq, kseq.aes_gmac.pn, 6); + } else { + pn64 = atomic64_read(&key->u.aes_gmac.tx_pn); + seq[0] = pn64; + seq[1] = pn64 >> 8; + seq[2] = pn64 >> 16; + seq[3] = pn64 >> 24; + seq[4] = pn64 >> 32; + seq[5] = pn64 >> 40; + } params.seq = seq; params.seq_len = 6; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = atomic64_read(&key->u.gcmp.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + memcpy(seq, kseq.gcmp.pn, 6); + } else { + pn64 = atomic64_read(&key->u.gcmp.tx_pn); + seq[0] = pn64; + seq[1] = pn64 >> 8; + seq[2] = pn64 >> 16; + seq[3] = pn64 >> 24; + seq[4] = pn64 >> 32; + seq[5] = pn64 >> 40; + } params.seq = seq; params.seq_len = 6; break; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 26e1ca8a474a..c01e681b90fb 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -417,12 +417,13 @@ static inline int drv_get_stats(struct ieee80211_local *local, return ret; } -static inline void drv_get_tkip_seq(struct ieee80211_local *local, - u8 hw_key_idx, u32 *iv32, u16 *iv16) +static inline void drv_get_key_seq(struct ieee80211_local *local, + struct ieee80211_key *key, + struct ieee80211_key_seq *seq) { - if (local->ops->get_tkip_seq) - local->ops->get_tkip_seq(&local->hw, hw_key_idx, iv32, iv16); - trace_drv_get_tkip_seq(local, hw_key_idx, iv32, iv16); + if (local->ops->get_key_seq) + local->ops->get_key_seq(&local->hw, &key->conf, seq); + trace_drv_get_key_seq(local, &key->conf); } static inline int drv_set_frag_threshold(struct ieee80211_local *local, diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 4c2e7690226a..6f14591d8ca9 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -69,6 +69,17 @@ #define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, \ __entry->rx_chains_static, __entry->rx_chains_dynamic +#define KEY_ENTRY __field(u32, cipher) \ + __field(u8, hw_key_idx) \ + __field(u8, flags) \ + __field(s8, keyidx) +#define KEY_ASSIGN(k) __entry->cipher = (k)->cipher; \ + __entry->flags = (k)->flags; \ + __entry->keyidx = (k)->keyidx; \ + __entry->hw_key_idx = (k)->hw_key_idx; +#define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d" +#define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx + /* @@ -522,25 +533,19 @@ TRACE_EVENT(drv_set_key, LOCAL_ENTRY VIF_ENTRY STA_ENTRY - __field(u32, cipher) - __field(u8, hw_key_idx) - __field(u8, flags) - __field(s8, keyidx) + KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; - __entry->cipher = key->cipher; - __entry->flags = key->flags; - __entry->keyidx = key->keyidx; - __entry->hw_key_idx = key->hw_key_idx; + KEY_ASSIGN(key); ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT KEY_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, KEY_PR_ARG ) ); @@ -656,28 +661,25 @@ TRACE_EVENT(drv_get_stats, ) ); -TRACE_EVENT(drv_get_tkip_seq, +TRACE_EVENT(drv_get_key_seq, TP_PROTO(struct ieee80211_local *local, - u8 hw_key_idx, u32 *iv32, u16 *iv16), + struct ieee80211_key_conf *key), - TP_ARGS(local, hw_key_idx, iv32, iv16), + TP_ARGS(local, key), TP_STRUCT__entry( LOCAL_ENTRY - __field(u8, hw_key_idx) - __field(u32, iv32) - __field(u16, iv16) + KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; - __entry->hw_key_idx = hw_key_idx; - __entry->iv32 = *iv32; - __entry->iv16 = *iv16; + KEY_ASSIGN(key); ), TP_printk( - LOCAL_PR_FMT, LOCAL_PR_ARG + LOCAL_PR_FMT KEY_PR_FMT, + LOCAL_PR_ARG, KEY_PR_ARG ) ); -- cgit From a31cf1c69e89e0c2d5515b04aca313f1014a714d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Apr 2015 18:21:58 +0200 Subject: mac80211: extend get_key() to return PN for all ciphers For ciphers not supported by mac80211, the function currently doesn't return any PN data. Fix this by extending the driver's get_key_seq() a little more to allow moving arbitrary PN data. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 9 +++++++++ net/mac80211/key.c | 4 ++-- net/mac80211/key.h | 3 +-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index dd7014b09396..3469bbdc891c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -431,6 +431,15 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, params.seq = seq; params.seq_len = 6; break; + default: + if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + break; + if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) + break; + drv_get_key_seq(sdata->local, key, &kseq); + params.seq = kseq.hw.seq; + params.seq_len = kseq.hw.seq_len; + break; } params.key = key->conf.key; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 3e0b814f4db3..0a5d5c5ad30f 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -485,8 +485,8 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, break; default: if (cs) { - size_t len = (seq_len > MAX_PN_LEN) ? - MAX_PN_LEN : seq_len; + size_t len = (seq_len > IEEE80211_MAX_PN_LEN) ? + IEEE80211_MAX_PN_LEN : seq_len; key->conf.iv_len = cs->hdr_len; key->conf.icv_len = cs->mic_len; diff --git a/net/mac80211/key.h b/net/mac80211/key.h index c5a31835be0e..df430a618764 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -18,7 +18,6 @@ #define NUM_DEFAULT_KEYS 4 #define NUM_DEFAULT_MGMT_KEYS 2 -#define MAX_PN_LEN 16 struct ieee80211_local; struct ieee80211_sub_if_data; @@ -116,7 +115,7 @@ struct ieee80211_key { } gcmp; struct { /* generic cipher scheme */ - u8 rx_pn[IEEE80211_NUM_TIDS + 1][MAX_PN_LEN]; + u8 rx_pn[IEEE80211_NUM_TIDS + 1][IEEE80211_MAX_PN_LEN]; } gen; } u; -- cgit From e3a55b5399d55200c024fe0c2984dc7ad049da44 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 May 2015 16:32:29 +0200 Subject: mac80211: validate cipher scheme PN length better Currently, a cipher scheme can advertise an arbitrarily long sequence counter, but mac80211 only supports up to 16 bytes and the initial value from userspace will be truncated. Fix two things: * don't allow the driver to register anything longer than the 16 bytes that mac80211 reserves space for * require userspace to specify a starting value with the correct length (or none at all) Signed-off-by: Johannes Berg --- net/mac80211/key.c | 10 ++++++---- net/mac80211/main.c | 5 ++++- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 0a5d5c5ad30f..2e677376c958 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -485,15 +485,17 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, break; default: if (cs) { - size_t len = (seq_len > IEEE80211_MAX_PN_LEN) ? - IEEE80211_MAX_PN_LEN : seq_len; + if (seq_len && seq_len != cs->pn_len) { + kfree(key); + return ERR_PTR(-EINVAL); + } key->conf.iv_len = cs->hdr_len; key->conf.icv_len = cs->mic_len; for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) - for (j = 0; j < len; j++) + for (j = 0; j < seq_len; j++) key->u.gen.rx_pn[i][j] = - seq[len - j - 1]; + seq[seq_len - j - 1]; key->flags |= KEY_FLAG_CIPHER_SCHEME; } } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index effe9d39cd7e..3c956c5f99b2 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -768,8 +768,11 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_256; } - for (r = 0; r < local->hw.n_cipher_schemes; r++) + for (r = 0; r < local->hw.n_cipher_schemes; r++) { suites[w++] = cs[r].cipher; + if (WARN_ON(cs[r].pn_len > IEEE80211_MAX_PN_LEN)) + return -EINVAL; + } } local->hw.wiphy->cipher_suites = suites; -- cgit From be69c24a184490f941fa8ab0fefe0d6cee96afbf Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Mon, 27 Apr 2015 16:52:16 +0300 Subject: cfg80211: Allow GO concurrent relaxation after BSS disconnection If a P2P GO was allowed on a channel because of the GO concurrent relaxation, i.e., another station interface was associated to an AP on the same channel or the same UNII band, and the station interface disconnected from the AP, allow the following use cases unless the channel is marked as indoor only and the device is not operating in an indoor environment: 1. Allow the P2P GO to stay on its current channel. The rationale behind this is that if the channel or UNII band were allowed by the AP they could still be used to continue the P2P GO operation, and avoid connection breakage. 2. Allow another P2P GO to start on the same channel or another channel that is in the same UNII band as the previous instantiated P2P GO. Signed-off-by: Avraham Stern Signed-off-by: Arik Nemtsov Signed-off-by: Ilan Peer Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/chan.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 7aaf7415dc4c..5bcffdbf3e88 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -709,7 +709,7 @@ EXPORT_SYMBOL(cfg80211_chandef_usable); static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, struct ieee80211_channel *chan) { - struct wireless_dev *wdev_iter; + struct wireless_dev *wdev; struct wiphy *wiphy = wiphy_idx_to_wiphy(rdev->wiphy_idx); ASSERT_RTNL(); @@ -732,18 +732,27 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, * and thus fail the GO instantiation, consider only the interfaces of * the current registered device. */ - list_for_each_entry(wdev_iter, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wdev_list, list) { struct ieee80211_channel *other_chan = NULL; int r1, r2; - if (wdev_iter->iftype != NL80211_IFTYPE_STATION || - !netif_running(wdev_iter->netdev)) - continue; - - wdev_lock(wdev_iter); - if (wdev_iter->current_bss) - other_chan = wdev_iter->current_bss->pub.channel; - wdev_unlock(wdev_iter); + wdev_lock(wdev); + if (wdev->iftype == NL80211_IFTYPE_STATION && + wdev->current_bss) + other_chan = wdev->current_bss->pub.channel; + + /* + * If a GO already operates on the same GO_CONCURRENT channel, + * this one (maybe the same one) can beacon as well. We allow + * the operation even if the station we relied on with + * GO_CONCURRENT is disconnected now. But then we must make sure + * we're not outdoor on an indoor-only channel. + */ + if (wdev->iftype == NL80211_IFTYPE_P2P_GO && + wdev->beacon_interval && + !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) + other_chan = wdev->chandef.chan; + wdev_unlock(wdev); if (!other_chan) continue; -- cgit From 06f207fc541862ba8902ceda0ddeade6ea6bce72 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 6 May 2015 16:28:31 +0300 Subject: cfg80211: change GO_CONCURRENT to IR_CONCURRENT for STA The GO_CONCURRENT regulatory definition can be extended to station interfaces requesting to IR as part of TDLS off-channel operations. Rename the GO_CONCURRENT flag to IR_CONCURRENT and allow the added use-case. Change internal users of GO_CONCURRENT to use the new definition. Signed-off-by: Arik Nemtsov Reviewed-by: Johannes Berg Signed-off-by: Johannes Berg --- net/wireless/chan.c | 38 ++++++++++++++++++++++---------------- net/wireless/nl80211.c | 4 ++-- net/wireless/reg.c | 4 ++-- 3 files changed, 26 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 5bcffdbf3e88..915b328b9ac5 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -698,19 +698,20 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_chandef_usable); /* - * For GO only, check if the channel can be used under permissive conditions - * mandated by the some regulatory bodies, i.e., the channel is marked with - * IEEE80211_CHAN_GO_CONCURRENT and there is an additional station interface + * Check if the channel can be used under permissive conditions mandated by + * some regulatory bodies, i.e., the channel is marked with + * IEEE80211_CHAN_IR_CONCURRENT and there is an additional station interface * associated to an AP on the same channel or on the same UNII band * (assuming that the AP is an authorized master). - * In addition allow the GO to operate on a channel on which indoor operation is + * In addition allow operation on a channel on which indoor operation is * allowed, iff we are currently operating in an indoor environment. */ -static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, +static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, + enum nl80211_iftype iftype, struct ieee80211_channel *chan) { struct wireless_dev *wdev; - struct wiphy *wiphy = wiphy_idx_to_wiphy(rdev->wiphy_idx); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); ASSERT_RTNL(); @@ -718,16 +719,22 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) return false; + /* only valid for GO and TDLS off-channel (station/p2p-CL) */ + if (iftype != NL80211_IFTYPE_P2P_GO && + iftype != NL80211_IFTYPE_STATION && + iftype != NL80211_IFTYPE_P2P_CLIENT) + return false; + if (regulatory_indoor_allowed() && (chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) return true; - if (!(chan->flags & IEEE80211_CHAN_GO_CONCURRENT)) + if (!(chan->flags & IEEE80211_CHAN_IR_CONCURRENT)) return false; /* * Generally, it is possible to rely on another device/driver to allow - * the GO concurrent relaxation, however, since the device can further + * the IR concurrent relaxation, however, since the device can further * enforce the relaxation (by doing a similar verifications as this), * and thus fail the GO instantiation, consider only the interfaces of * the current registered device. @@ -748,7 +755,8 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, * GO_CONCURRENT is disconnected now. But then we must make sure * we're not outdoor on an indoor-only channel. */ - if (wdev->iftype == NL80211_IFTYPE_P2P_GO && + if (iftype == NL80211_IFTYPE_P2P_GO && + wdev->iftype == NL80211_IFTYPE_P2P_GO && wdev->beacon_interval && !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) other_chan = wdev->chandef.chan; @@ -793,7 +801,6 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, enum nl80211_iftype iftype) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); bool res; u32 prohibited_flags = IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_RADAR; @@ -801,13 +808,12 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype); /* - * Under certain conditions suggested by the some regulatory bodies - * a GO can operate on channels marked with IEEE80211_NO_IR - * so set this flag only if such relaxations are not enabled and - * the conditions are not met. + * Under certain conditions suggested by some regulatory bodies a + * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag + * only if such relaxations are not enabled and the conditions are not + * met. */ - if (iftype != NL80211_IFTYPE_P2P_GO || - !cfg80211_go_permissive_chan(rdev, chandef->chan)) + if (!cfg80211_ir_permissive_chan(wiphy, iftype, chandef->chan)) prohibited_flags |= IEEE80211_CHAN_NO_IR; if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 && diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8a33bbae9ec5..c264effd00a6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -639,8 +639,8 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, if ((chan->flags & IEEE80211_CHAN_INDOOR_ONLY) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_INDOOR_ONLY)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_GO_CONCURRENT) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_GO_CONCURRENT)) + if ((chan->flags & IEEE80211_CHAN_IR_CONCURRENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_IR_CONCURRENT)) goto nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_20MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_20MHZ)) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 0e347f888fe9..d359e0610198 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -989,8 +989,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_OFDM; if (rd_flags & NL80211_RRF_NO_OUTDOOR) channel_flags |= IEEE80211_CHAN_INDOOR_ONLY; - if (rd_flags & NL80211_RRF_GO_CONCURRENT) - channel_flags |= IEEE80211_CHAN_GO_CONCURRENT; + if (rd_flags & NL80211_RRF_IR_CONCURRENT) + channel_flags |= IEEE80211_CHAN_IR_CONCURRENT; if (rd_flags & NL80211_RRF_NO_HT40MINUS) channel_flags |= IEEE80211_CHAN_NO_HT40MINUS; if (rd_flags & NL80211_RRF_NO_HT40PLUS) -- cgit From bbbe8c8c596b3784a2ed08772900e827f8ba72c5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 May 2015 16:00:32 +0200 Subject: mac80211: add missing documentation for rate_ctrl_lock This was missed in the previous patch, add some documentation for rate_ctrl_lock to avoid docbook warnings. Signed-off-by: Johannes Berg --- net/mac80211/sta_info.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index fdd981c0fa21..9bd1e97876bd 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -285,6 +285,8 @@ struct ieee80211_fast_tx { * @gtk: group keys negotiated with this station, if any * @gtk_idx: last installed group key index * @rate_ctrl: rate control algorithm reference + * @rate_ctrl_lock: spinlock used to protect rate control data + * (data inside the algorithm, so serializes calls there) * @rate_ctrl_priv: rate control private per-STA pointer * @last_tx_rate: rate used for last transmit, to report to userspace as * "the" transmit rate -- cgit From bf45a24243eb0673f52eaaac620617601e484f07 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 6 May 2015 18:30:50 +0300 Subject: mac80211: adjust reserved chan_ctx when assigned to vif When a vif starts using a reserved channel context (during CSA, for example) the required chandef was recalculated, however it was never applied. This could result in using chanctx with narrower width than actually required. Fix this by calling ieee80211_change_chanctx with the recalculated chandef. This both changes the chanctx's width and recalcs min_def. Signed-off-by: Andrei Otcheretianski Reviewed-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 7e9b62475400..f01c18a3160e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1010,6 +1010,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; + ieee80211_change_chanctx(local, new_ctx, chandef); + vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; vif_chsw[0].new_ctx = &new_ctx->conf; @@ -1083,6 +1085,8 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; + ieee80211_change_chanctx(local, new_ctx, chandef); + list_del(&sdata->reserved_chanctx_list); sdata->reserved_chanctx = NULL; -- cgit From 448b4482c6713c5b62828622b7bf46900ff9569b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 May 2015 01:09:56 +0200 Subject: net: dsa: Add lockdep class to tx queues to avoid lockdep splat DSA stacks an Ethernet device on top of an Ethernet device. This can cause false positive lockdep splats for the transmit queue: Acked-by: Florian Fainelli ============================================= [ INFO: possible recursive locking detected ] 4.0.0-rc7-01838-g70621a215fc7 #386 Not tainted --------------------------------------------- kworker/0:0/4 is trying to acquire lock: (_xmit_ETHER#2){+.-...}, at: [] sch_direct_xmit+0xa8/0x1fc but task is already holding lock: (_xmit_ETHER#2){+.-...}, at: [] __dev_queue_xmit+0x4d4/0x56c other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(_xmit_ETHER#2); lock(_xmit_ETHER#2); To avoid this, walk the tq queues of the dsa slaves and set a lockdep class. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/slave.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 827cda560a55..03e041addea3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -810,12 +810,19 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, return 0; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); - netif_device_detach(slave_dev); - if (p->phy) { phy_stop(p->phy); p->old_pause = -1; @@ -861,6 +868,9 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->netdev_ops = &dsa_slave_netdev_ops; slave_dev->swdev_ops = &dsa_slave_swdev_ops; + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); + SET_NETDEV_DEV(slave_dev, parent); slave_dev->dev.of_node = ds->pd->port_dn[port]; slave_dev->vlan_features = master->vlan_features; -- cgit From c3d6fb85b2ed6a57b57b322a470b3b4eefb34fb7 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Wed, 6 May 2015 13:58:54 +0200 Subject: tipc: fix default link prop regression in nl compat Default link properties can be set for media or bearer. This functionality was missed when introducing the NL compatibility layer. This patch implements this functionality in the compat netlink layer. It works the same way as it did in the old API. We search for media and bearers matching the "link name". If we find a matching media or bearer the link tolerance, priority or window is used as default for new links on that media or bearer. Fixes: 37e2d4843f9e (tipc: convert legacy nl link prop set to nl compat) Reported-by: Tomi Ollila Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 2 +- net/tipc/netlink_compat.c | 135 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 114 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 70e3dacbf84a..99c0bd43feed 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -812,7 +812,7 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) char *name; struct tipc_bearer *b; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; - struct net *net = genl_info_net(info); + struct net *net = sock_net(skb->sk); if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index ce9121e8e990..809aaf027876 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -55,6 +55,7 @@ struct tipc_nl_compat_msg { int rep_type; int rep_size; int req_type; + struct net *net; struct sk_buff *rep; struct tlv_desc *req; struct sock *dst_sk; @@ -68,7 +69,8 @@ struct tipc_nl_compat_cmd_dump { struct tipc_nl_compat_cmd_doit { int (*doit)(struct sk_buff *skb, struct genl_info *info); - int (*transcode)(struct sk_buff *skb, struct tipc_nl_compat_msg *msg); + int (*transcode)(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg); }; static int tipc_skb_tailroom(struct sk_buff *skb) @@ -281,7 +283,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, if (!trans_buf) return -ENOMEM; - err = (*cmd->transcode)(trans_buf, msg); + err = (*cmd->transcode)(cmd, trans_buf, msg); if (err) goto trans_out; @@ -353,7 +355,8 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg, nla_len(bearer[TIPC_NLA_BEARER_NAME])); } -static int tipc_nl_compat_bearer_enable(struct sk_buff *skb, +static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct nlattr *prop; @@ -385,7 +388,8 @@ static int tipc_nl_compat_bearer_enable(struct sk_buff *skb, return 0; } -static int tipc_nl_compat_bearer_disable(struct sk_buff *skb, +static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; @@ -576,11 +580,81 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, &link_info, sizeof(link_info)); } -static int tipc_nl_compat_link_set(struct sk_buff *skb, - struct tipc_nl_compat_msg *msg) +static int __tipc_add_link_prop(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg, + struct tipc_link_config *lc) +{ + switch (msg->cmd) { + case TIPC_CMD_SET_LINK_PRI: + return nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value)); + case TIPC_CMD_SET_LINK_TOL: + return nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value)); + case TIPC_CMD_SET_LINK_WINDOW: + return nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value)); + } + + return -EINVAL; +} + +static int tipc_nl_compat_media_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) { - struct nlattr *link; struct nlattr *prop; + struct nlattr *media; + struct tipc_link_config *lc; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + media = nla_nest_start(skb, TIPC_NLA_MEDIA); + if (!media) + return -EMSGSIZE; + + if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) + return -EMSGSIZE; + + prop = nla_nest_start(skb, TIPC_NLA_MEDIA_PROP); + if (!prop) + return -EMSGSIZE; + + __tipc_add_link_prop(skb, msg, lc); + nla_nest_end(skb, prop); + nla_nest_end(skb, media); + + return 0; +} + +static int tipc_nl_compat_bearer_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct nlattr *prop; + struct nlattr *bearer; + struct tipc_link_config *lc; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + bearer = nla_nest_start(skb, TIPC_NLA_BEARER); + if (!bearer) + return -EMSGSIZE; + + if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) + return -EMSGSIZE; + + prop = nla_nest_start(skb, TIPC_NLA_BEARER_PROP); + if (!prop) + return -EMSGSIZE; + + __tipc_add_link_prop(skb, msg, lc); + nla_nest_end(skb, prop); + nla_nest_end(skb, bearer); + + return 0; +} + +static int __tipc_nl_compat_link_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct nlattr *prop; + struct nlattr *link; struct tipc_link_config *lc; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -596,24 +670,40 @@ static int tipc_nl_compat_link_set(struct sk_buff *skb, if (!prop) return -EMSGSIZE; - if (msg->cmd == TIPC_CMD_SET_LINK_PRI) { - if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value))) - return -EMSGSIZE; - } else if (msg->cmd == TIPC_CMD_SET_LINK_TOL) { - if (nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value))) - return -EMSGSIZE; - } else if (msg->cmd == TIPC_CMD_SET_LINK_WINDOW) { - if (nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value))) - return -EMSGSIZE; - } - + __tipc_add_link_prop(skb, msg, lc); nla_nest_end(skb, prop); nla_nest_end(skb, link); return 0; } -static int tipc_nl_compat_link_reset_stats(struct sk_buff *skb, +static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct tipc_link_config *lc; + struct tipc_bearer *bearer; + struct tipc_media *media; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + media = tipc_media_find(lc->name); + if (media) { + cmd->doit = &tipc_nl_media_set; + return tipc_nl_compat_media_set(skb, msg); + } + + bearer = tipc_bearer_find(msg->net, lc->name); + if (bearer) { + cmd->doit = &tipc_nl_bearer_set; + return tipc_nl_compat_bearer_set(skb, msg); + } + + return __tipc_nl_compat_link_set(skb, msg); +} + +static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; @@ -851,7 +941,8 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg, sizeof(node_info)); } -static int tipc_nl_compat_net_set(struct sk_buff *skb, +static int tipc_nl_compat_net_set(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { u32 val; @@ -1007,7 +1098,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) struct nlmsghdr *req_nlh; struct nlmsghdr *rep_nlh; struct tipc_genlmsghdr *req_userhdr = info->userhdr; - struct net *net = genl_info_net(info); memset(&msg, 0, sizeof(msg)); @@ -1015,6 +1105,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; msg.dst_sk = info->dst_sk; + msg.net = genl_info_net(info); if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN); @@ -1043,7 +1134,7 @@ send: rep_nlh = nlmsg_hdr(msg.rep); memcpy(rep_nlh, info->nlhdr, len); rep_nlh->nlmsg_len = msg.rep->len; - genlmsg_unicast(net, msg.rep, NETLINK_CB(skb).portid); + genlmsg_unicast(msg.net, msg.rep, NETLINK_CB(skb).portid); return err; } -- cgit From 670f4f8818ee54d0c1831e3165dadd5b8a3f713e Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Wed, 6 May 2015 13:58:55 +0200 Subject: tipc: add broadcast link window set/get to nl api Add the ability to get or set the broadcast link window through the new netlink API. The functionality was unintentionally missing from the new netlink API. Adding this means that we also fix the breakage in the old API when coming through the compat layer. Fixes: 37e2d4843f9e (tipc: convert legacy nl link prop set to nl compat) Reported-by: Tomi Ollila Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 21 ++++++++++++++++++ net/tipc/bcast.h | 1 + net/tipc/link.c | 66 ++++++++++++++++++++++++++++++-------------------------- 3 files changed, 58 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index c5cbdcb1f0b5..3e18fc6c7877 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -866,6 +866,27 @@ int tipc_bclink_set_queue_limits(struct net *net, u32 limit) return 0; } +int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]) +{ + int err; + u32 win; + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; + + if (!attrs[TIPC_NLA_LINK_PROP]) + return -EINVAL; + + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); + if (err) + return err; + + if (!props[TIPC_NLA_PROP_WIN]) + return -EOPNOTSUPP; + + win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + + return tipc_bclink_set_queue_limits(net, win); +} + int tipc_bclink_init(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 4bdc12277d33..3c290a48f720 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -131,6 +131,7 @@ uint tipc_bclink_get_mtu(void); int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list); void tipc_bclink_wakeup_users(struct net *net); int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); +int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); void tipc_bclink_input(struct net *net); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index 43a515dc97b0..374d52335168 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1893,6 +1893,9 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + if (strcmp(name, tipc_bclink_name) == 0) + return tipc_nl_bc_link_set(net, attrs); + node = tipc_link_find_owner(net, name, &bearer_id); if (!node) return -EINVAL; @@ -2175,50 +2178,53 @@ out: int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct sk_buff *ans_skb; struct tipc_nl_msg msg; - struct tipc_link *link; - struct tipc_node *node; char *name; - int bearer_id; int err; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + if (!info->attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; - name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]); - node = tipc_link_find_owner(net, name, &bearer_id); - if (!node) - return -EINVAL; - ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!ans_skb) + msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg.skb) return -ENOMEM; - msg.skb = ans_skb; - msg.portid = info->snd_portid; - msg.seq = info->snd_seq; - - tipc_node_lock(node); - link = node->links[bearer_id]; - if (!link) { - err = -EINVAL; - goto err_out; - } - - err = __tipc_nl_add_link(net, &msg, link, 0); - if (err) - goto err_out; + if (strcmp(name, tipc_bclink_name) == 0) { + err = tipc_nl_add_bc_link(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + } else { + int bearer_id; + struct tipc_node *node; + struct tipc_link *link; - tipc_node_unlock(node); + node = tipc_link_find_owner(net, name, &bearer_id); + if (!node) + return -EINVAL; - return genlmsg_reply(ans_skb, info); + tipc_node_lock(node); + link = node->links[bearer_id]; + if (!link) { + tipc_node_unlock(node); + nlmsg_free(msg.skb); + return -EINVAL; + } -err_out: - tipc_node_unlock(node); - nlmsg_free(ans_skb); + err = __tipc_nl_add_link(net, &msg, link, 0); + tipc_node_unlock(node); + if (err) { + nlmsg_free(msg.skb); + return err; + } + } - return err; + return genlmsg_reply(msg.skb, info); } int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) -- cgit From b063bc5ea77b1c1c0e7798f641f53504d0f64bf8 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Wed, 6 May 2015 13:58:56 +0200 Subject: tipc: send explicit not supported error in nl compat The legacy netlink API treated EPERM (permission denied) as "operation not supported". Reported-by: Tomi Ollila Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 809aaf027876..53e0fee80086 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1121,7 +1121,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) } err = tipc_nl_compat_handle(&msg); - if (err == -EOPNOTSUPP) + if ((err == -EOPNOTSUPP) || (err == -EPERM)) msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); else if (err == -EINVAL) msg.rep = tipc_get_err_tlv(TIPC_CFG_TLV_ERROR); -- cgit From 21c8fe9915276d923f8c1e43434fd6d37a3b9aef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 May 2015 14:26:24 -0700 Subject: tcp: adjust window probe timers to safer values With the advent of small rto timers in datacenter TCP, (ip route ... rto_min x), the following can happen : 1) Qdisc is full, transmit fails. TCP sets a timer based on icsk_rto to retry the transmit, without exponential backoff. With low icsk_rto, and lot of sockets, all cpus are servicing timer interrupts like crazy. Intent of the code was to retry with a timer between 200 (TCP_RTO_MIN) and 500ms (TCP_RESOURCE_PROBE_INTERVAL) 2) Receivers can send zero windows if they don't drain their receive queue. TCP sends zero window probes, based on icsk_rto current value, with exponential backoff. With /proc/sys/net/ipv4/tcp_retries2 being 15 (or even smaller in some cases), sender can abort in less than one or two minutes ! If receiver stops the sender, it obviously doesn't care of very tight rto. Probability of dropping the ACK reopening the window is not worth the risk. Lets change the base timer to be at least 200ms (TCP_RTO_MIN) for these events (but not normal RTO based retransmits) A followup patch adds a new SNMP counter, as it would have helped a lot diagnosing this issue. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index df2ca615cd0c..cf8b20ff6658 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3233,7 +3233,7 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a369e8a70b2c..b76c719e1979 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3490,7 +3490,7 @@ void tcp_send_probe0(struct sock *sk) probe_max = TCP_RESOURCE_PROBE_INTERVAL; } inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - inet_csk_rto_backoff(icsk, probe_max), + tcp_probe0_when(sk, probe_max), TCP_RTO_MAX); } -- cgit From e520af48c7e5acae5f17f82a79ba7ab7cf156f3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 May 2015 14:26:25 -0700 Subject: tcp: add TCPWinProbe and TCPKeepAlive SNMP counters Diagnosing problems related to Window Probes has been hard because we lack a counter. TCPWinProbe counts the number of ACK packets a sender has to send at regular intervals to make sure a reverse ACK packet opening back a window had not been lost. TCPKeepAlive counts the number of ACK packets sent to keep TCP flows alive (SO_KEEPALIVE) Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/proc.c | 2 ++ net/ipv4/tcp_output.c | 13 +++++++------ net/ipv4/tcp_timer.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index e1f3b911dd1e..da5d483e236a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), + SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), + SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b76c719e1979..7386d32cd670 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3382,7 +3382,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack); * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ -static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3400,6 +3400,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); + NET_INC_STATS_BH(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3407,12 +3408,12 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ -int tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3449,8 +3450,8 @@ int tcp_write_wakeup(struct sock *sk) return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) - tcp_xmit_probe_skb(sk, 1); - return tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 1, mib); + return tcp_xmit_probe_skb(sk, 0, mib); } } @@ -3464,7 +3465,7 @@ void tcp_send_probe0(struct sock *sk) unsigned long probe_max; int err; - err = tcp_write_wakeup(sk); + err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || !tcp_send_head(sk)) { /* Cancel probe timer, if it is not required. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8c65dc147d8b..65bf670e8714 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -616,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data) tcp_write_err(sk); goto out; } - if (tcp_write_wakeup(sk) <= 0) { + if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { -- cgit From 4ae92bc77ac8e620f7c8d59b5882a4cb0d1c4ef1 Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:27 +0200 Subject: net: filter: add a callback to allow classic post-verifier transformations This is in preparation for use by the seccomp code, the rationale is not to duplicate additional code within the seccomp layer, but instead, have it abstracted and hidden within the classic BPF API. As an interim step, this now also makes bpf_prepare_filter() visible (not as exported symbol though), so that seccomp can reuse that code path instead of reimplementing it. Joint work with Daniel Borkmann. Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index bf831a85c315..e670494f1d83 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -988,7 +988,8 @@ out_err: return ERR_PTR(err); } -static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) +struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; @@ -1001,6 +1002,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) return ERR_PTR(err); } + /* There might be additional checks and transformations + * needed on classic filters, f.e. in case of seccomp. + */ + if (trans) { + err = trans(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + } + /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ @@ -1050,7 +1062,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = bpf_prepare_filter(fp); + fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -1135,7 +1147,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - prog = bpf_prepare_filter(prog); + prog = bpf_prepare_filter(prog, NULL); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit From d9e12f42e58da475379b9080708b94f2095904af Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:28 +0200 Subject: seccomp: simplify seccomp_prepare_filter and reuse bpf_prepare_filter Remove the calls to bpf_check_classic(), bpf_convert_filter() and bpf_migrate_runtime() and let bpf_prepare_filter() take care of that instead. seccomp_check_filter() is passed to bpf_prepare_filter() so that it gets called from there, after bpf_check_classic(). We can now remove exposure of two internal classic BPF functions previously used by seccomp. The export of bpf_check_classic() symbol, previously known as sk_chk_filter(), was there since pre git times, and no in-tree module was using it, therefore remove it. Joint work with Daniel Borkmann. Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e670494f1d83..f887084740cd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -355,8 +355,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. */ -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) +static int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; struct bpf_insn *new_insn; @@ -751,7 +751,8 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) +static int bpf_check_classic(const struct sock_filter *filter, + unsigned int flen) { bool anc_found; int pc; @@ -825,7 +826,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(bpf_check_classic); static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) -- cgit From 658da9379d0e1aa1e3e40c9b795357ada818b1c8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 May 2015 16:12:29 +0200 Subject: net: filter: add __GFP_NOWARN flag for larger kmem allocs When seccomp BPF was added, it was discussed to add __GFP_NOWARN flag for their configuration path as f.e. up to 32K allocations are more prone to fail under stress. As we're going to reuse BPF API, add __GFP_NOWARN flags where larger kmalloc() and friends allocations could fail. It doesn't make much sense to pass around __GFP_NOWARN everywhere as an extra argument only for seccomp while we just as well could run into similar issues for socket filters, where it's not desired to have a user application throw a WARN() due to allocation failure. Signed-off-by: Daniel Borkmann Cc: Nicolas Schichan Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index f887084740cd..45c015d6b974 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -371,7 +371,8 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { - addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); + addrs = kcalloc(len, sizeof(*addrs), + GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } @@ -839,7 +840,9 @@ static int bpf_prog_store_orig_filter(struct bpf_prog *fp, fkprog = fp->orig_prog; fkprog->len = fprog->len; - fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); + + fkprog->filter = kmemdup(fp->insns, fsize, + GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; @@ -941,7 +944,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; -- cgit From ac67eb2c5347bd9976308c0e0cf1d9e7ca690342 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 May 2015 16:12:30 +0200 Subject: seccomp, filter: add and use bpf_prog_create_from_user from seccomp Seccomp has always been a special candidate when it comes to preparation of its filters in seccomp_prepare_filter(). Due to the extra checks and filter rewrite it partially duplicates code and has BPF internals exposed. This patch adds a generic API inside the BPF code code that seccomp can use and thus keep it's filter preparation code minimal and better maintainable. The other side-effect is that now classic JITs can add seccomp support as well by only providing a BPF_LDX | BPF_W | BPF_ABS translation. Tested with seccomp and BPF test suites. Signed-off-by: Daniel Borkmann Cc: Nicolas Schichan Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 45c015d6b974..a831f193e2c7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -991,8 +991,8 @@ out_err: return ERR_PTR(err); } -struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, - bpf_aux_classic_check_t trans) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; @@ -1074,6 +1074,53 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) } EXPORT_SYMBOL_GPL(bpf_prog_create); +/** + * bpf_prog_create_from_user - create an unattached filter from user buffer + * @pfp: the unattached filter that is created + * @fprog: the filter program + * @trans: post-classic verifier transformation handler + * + * This function effectively does the same as bpf_prog_create(), only + * that it builds up its insns buffer from user space provided buffer. + * It also allows for passing a bpf_aux_classic_check_t handler. + */ +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + __bpf_prog_free(fp); + return -EFAULT; + } + + fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, trans); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} + void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); -- cgit From 790ba4566c1a709462550463b222d23b6f352b47 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 6 May 2015 15:52:23 +0000 Subject: tcp: set SOCK_NOSPACE under memory pressure Under tcp memory pressure, calling epoll_wait() in edge triggered mode after -EAGAIN, can result in an indefinite hang in epoll_wait(), even when there is sufficient memory available to continue making progress. The problem is that when __sk_mem_schedule() returns 0 under memory pressure, we do not set the SOCK_NOSPACE flag in the tcp write paths (tcp_sendmsg() or do_tcp_sendpages()). Then, since SOCK_NOSPACE is used to trigger wakeups when incoming acks create sufficient new space in the write queue, all outstanding packets are acked, but we never wake up with the the EPOLLOUT that we are expecting from epoll_wait(). This issue is currently limited to epoll() when used in edge trigger mode, since 'tcp_poll()', does in fact currently set SOCK_NOSPACE. This is sufficient for poll()/select() and epoll() in level trigger mode. However, in edge trigger mode, epoll() is relying on the write path to set SOCK_NOSPACE. EPOLL(7) says that in edge-trigger mode we can only call epoll_wait() after read/write return -EAGAIN. Thus, in the case of the socket write, we are relying on the fact that tcp_sendmsg()/network write paths are going to issue a wakeup for us at some point in the future when we get -EAGAIN. Normally, epoll() edge trigger works fine when we've exceeded the sk->sndbuf because in that case we do set SOCK_NOSPACE. However, when we return -EAGAIN from the write path b/c we are over the tcp memory limits and not b/c we are over the sndbuf, we are never going to get another wakeup. I can reproduce this issue, using SO_SNDBUF, since __sk_mem_schedule() will return 0, or failure more readily with SO_SNDBUF: 1) create socket and set SO_SNDBUF to N 2) add socket as edge trigger 3) write to socket and block in epoll on -EAGAIN 4) cause tcp mem pressure via: echo "" > net.ipv4.tcp_mem The fix here is simply to set SOCK_NOSPACE in sk_stream_wait_memory() when the socket is non-blocking. Note that SOCK_NOSPACE, in addition to waking up outstanding waiters is also used to expand the size of the sk->sndbuf. However, we will not expand it by setting it in this case because tcp_should_expand_sndbuf(), ensures that no expansion occurs when we are under tcp memory pressure. Note that we could still hang if sk->sk_wmem_queue is 0, when we get the -EAGAIN. In this case the SOCK_NOSPACE bit will not help, since we are waiting for and event that will never happen. I believe that this case is harder to hit (and did not hit in my testing), in that over the tcp 'soft' memory limits, we continue to guarantee a minimum write buffer size. Perhaps, we could return -ENOSPC in this case, or maybe we simply issue a wakeup in this case, such that we keep retrying the write. Note that this case is not specific to epoll() ET, but rather would affect blocking sockets as well. So I view this patch as bringing epoll() edge-trigger into sync with the current poll()/select()/epoll() level trigger and blocking sockets behavior. Signed-off-by: Jason Baron Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/stream.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/stream.c b/net/core/stream.c index 301c05f26060..d70f77a0c889 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -119,6 +119,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) int err = 0; long vm_wait = 0; long current_timeo = *timeo_p; + bool noblock = (*timeo_p ? false : true); DEFINE_WAIT(wait); if (sk_stream_memory_free(sk)) @@ -131,8 +132,11 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - if (!*timeo_p) + if (!*timeo_p) { + if (noblock) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); goto do_nonblock; + } if (signal_pending(current)) goto do_interrupted; clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); -- cgit From 109582af18b9aade9385ea6609a792f80a7d70ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:47 +0200 Subject: netns: returns always an id in __peernet2id() All callers of this function expect a nsid, not an error. Thus, returns NETNSA_NSID_NOT_ASSIGNED in case of error so that callers don't have to convert the error to NETNSA_NSID_NOT_ASSIGNED. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 78fc04ad36fc..294d38742e2a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -192,10 +192,12 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) if (id > 0) return id; - if (alloc) - return alloc_netid(net, peer, -1); + if (alloc) { + id = alloc_netid(net, peer, -1); + return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + } - return -ENOENT; + return NETNSA_NSID_NOT_ASSIGNED; } /* This function returns the id of a peer netns. If no id is assigned, one will @@ -204,10 +206,8 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) int peernet2id(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; - int id; - id = __peernet2id(net, peer, alloc); - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + return __peernet2id(net, peer, alloc); } EXPORT_SYMBOL(peernet2id); @@ -554,13 +554,10 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nsid >= 0) { + if (nsid >= 0) id = nsid; - } else { + else id = __peernet2id(net, peer, false); - if (id < 0) - id = NETNSA_NSID_NOT_ASSIGNED; - } if (nla_put_s32(skb, NETNSA_NSID, id)) goto nla_put_failure; -- cgit From cab3c8ec8d57ef48ed754ee7acf2b9bdce80fa5f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:48 +0200 Subject: netns: always provide the id to rtnl_net_fill() The goal of this commit is to prepare the rework of the locking of nsnid protection. After this patch, rtnl_net_notifyid() will not call anymore __peernet2id(), ie no idr_* operation into this function. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 294d38742e2a..37c68bb72db3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -147,8 +147,7 @@ static void ops_free_list(const struct pernet_operations *ops, } } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id); +static void rtnl_net_notifyid(struct net *net, int cmd, int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0, id; @@ -162,7 +161,7 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); if (id >= 0) - rtnl_net_notifyid(net, peer, RTM_NEWNSID, id); + rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } @@ -365,7 +364,7 @@ static void cleanup_net(struct work_struct *work) int id = __peernet2id(tmp, net, false); if (id >= 0) { - rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); + rtnl_net_notifyid(tmp, RTM_DELNSID, id); idr_remove(&tmp->netns_ids, id); } } @@ -538,14 +537,10 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, struct net *peer, - int nsid) + int cmd, struct net *net, int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - int id; - - ASSERT_RTNL(); nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); if (!nlh) @@ -554,11 +549,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nsid >= 0) - id = nsid; - else - id = __peernet2id(net, peer, false); - if (nla_put_s32(skb, NETNSA_NSID, id)) + if (nla_put_s32(skb, NETNSA_NSID, nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -575,7 +566,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[NETNSA_MAX + 1]; struct sk_buff *msg; struct net *peer; - int err; + int err, id; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy); @@ -597,8 +588,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } + id = __peernet2id(net, peer, false); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer, -1); + RTM_GETNSID, net, id); if (err < 0) goto err_out; @@ -630,7 +622,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, peer, id); + RTM_NEWNSID, net_cb->net, id); if (ret < 0) return ret; @@ -658,8 +650,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id) { struct sk_buff *msg; int err = -ENOMEM; @@ -668,7 +659,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id); + err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); if (err < 0) goto err_out; -- cgit From 7a0877d4b438886b72be61632eaa774d13262f70 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:49 +0200 Subject: netns: rename peernet2id() to peernet2id_alloc() In a following commit, a new function will be introduced to only lookup for a nsid (no allocation if the nsid doesn't exist). To avoid confusion, the existing function is renamed. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 4 ++-- net/core/rtnetlink.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 37c68bb72db3..9c806ac569f9 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -202,13 +202,13 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ -int peernet2id(struct net *net, struct net *peer) +int peernet2id_alloc(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; return __peernet2id(net, peer, alloc); } -EXPORT_SYMBOL(peernet2id); +EXPORT_SYMBOL(peernet2id_alloc); struct net *get_net_ns_by_id(struct net *net, int id) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 666e0928ba40..83e08323fdcd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1204,7 +1204,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id(dev_net(dev), link_net); + int id = peernet2id_alloc(dev_net(dev), link_net); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) goto nla_put_failure; -- cgit From 3138dbf881274cb20d9aa1b307861f689e820fbe Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:50 +0200 Subject: netns: notify new nsid outside __peernet2id() There is no functional change with this patch. It will ease the refactoring of the locking system that protects nsids and the support of the netlink socket option NETLINK_LISTEN_ALL_NSID. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9c806ac569f9..ee864241f8d6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -147,10 +147,9 @@ static void ops_free_list(const struct pernet_operations *ops, } } -static void rtnl_net_notifyid(struct net *net, int cmd, int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { - int min = 0, max = 0, id; + int min = 0, max = 0; ASSERT_RTNL(); @@ -159,11 +158,7 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) max = reqid + 1; } - id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); - if (id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id); - - return id; + return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -179,34 +174,50 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -static int __peernet2id(struct net *net, struct net *peer, bool alloc) +static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); + bool alloc_it = *alloc; ASSERT_RTNL(); + *alloc = false; + /* Magic value for id 0. */ if (id == NET_ID_ZERO) return 0; if (id > 0) return id; - if (alloc) { + if (alloc_it) { id = alloc_netid(net, peer, -1); + *alloc = true; return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; } return NETNSA_NSID_NOT_ASSIGNED; } +static int __peernet2id(struct net *net, struct net *peer) +{ + bool no = false; + + return __peernet2id_alloc(net, peer, &no); +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ int peernet2id_alloc(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; + int id; - return __peernet2id(net, peer, alloc); + id = __peernet2id_alloc(net, peer, &alloc); + if (alloc && id >= 0) + rtnl_net_notifyid(net, RTM_NEWNSID, id); + return id; } EXPORT_SYMBOL(peernet2id_alloc); @@ -361,7 +372,7 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); + int id = __peernet2id(tmp, net); if (id >= 0) { rtnl_net_notifyid(tmp, RTM_DELNSID, id); @@ -516,14 +527,16 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - if (__peernet2id(net, peer, false) >= 0) { + if (__peernet2id(net, peer) >= 0) { err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - if (err > 0) + if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; + } out: put_net(peer); return err; @@ -588,7 +601,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - id = __peernet2id(net, peer, false); + id = __peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, RTM_GETNSID, net, id); if (err < 0) -- cgit From 95f38411df055a0ecefe3a3d119d98241087d5ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:51 +0200 Subject: netns: use a spin_lock to protect nsid management Before this patch, nsid were protected by the rtnl lock. The goal of this patch is to be able to find a nsid without needing to hold the rtnl lock. The next patch will introduce a netlink socket option to listen to all netns that have a nsid assigned into the netns where the socket is opened. Thus, it's important to call rtnl_net_notifyid() outside the spinlock, to avoid a recursive lock (nsid are notified via rtnl). This was the main reason of the previous patch. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 57 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ee864241f8d6..ae5008b097de 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -28,6 +28,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; DEFINE_MUTEX(net_mutex); +static DEFINE_SPINLOCK(nsid_lock); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -147,18 +148,17 @@ static void ops_free_list(const struct pernet_operations *ops, } } +/* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; - ASSERT_RTNL(); - if (reqid >= 0) { min = reqid; max = reqid + 1; } - return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -174,13 +174,15 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } +/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc + * is set to true, thus the caller knows that the new id must be notified via + * rtnl. + */ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); bool alloc_it = *alloc; - ASSERT_RTNL(); - *alloc = false; /* Magic value for id 0. */ @@ -198,6 +200,7 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) return NETNSA_NSID_NOT_ASSIGNED; } +/* should be called with nsid_lock held */ static int __peernet2id(struct net *net, struct net *peer) { bool no = false; @@ -211,27 +214,46 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id); */ int peernet2id_alloc(struct net *net, struct net *peer) { - bool alloc = atomic_read(&peer->count) == 0 ? false : true; + unsigned long flags; + bool alloc; int id; + spin_lock_irqsave(&nsid_lock, flags); + alloc = atomic_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); + spin_unlock_irqrestore(&nsid_lock, flags); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } EXPORT_SYMBOL(peernet2id_alloc); +/* This function returns, if assigned, the id of a peer netns. */ +static int peernet2id(struct net *net, struct net *peer) +{ + unsigned long flags; + int id; + + spin_lock_irqsave(&nsid_lock, flags); + id = __peernet2id(net, peer); + spin_unlock_irqrestore(&nsid_lock, flags); + return id; +} + struct net *get_net_ns_by_id(struct net *net, int id) { + unsigned long flags; struct net *peer; if (id < 0) return NULL; rcu_read_lock(); + spin_lock_irqsave(&nsid_lock, flags); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); + spin_unlock_irqrestore(&nsid_lock, flags); rcu_read_unlock(); return peer; @@ -372,14 +394,19 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net); + int id; - if (id >= 0) { - rtnl_net_notifyid(tmp, RTM_DELNSID, id); + spin_lock_irq(&nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) idr_remove(&tmp->netns_ids, id); - } + spin_unlock_irq(&nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); } + spin_lock_irq(&nsid_lock); idr_destroy(&net->netns_ids); + spin_unlock_irq(&nsid_lock); } rtnl_unlock(); @@ -507,6 +534,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + unsigned long flags; struct net *peer; int nsid, err; @@ -527,12 +555,14 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); + spin_lock_irqsave(&nsid_lock, flags); if (__peernet2id(net, peer) >= 0) { err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); + spin_unlock_irqrestore(&nsid_lock, flags); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; @@ -601,7 +631,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - id = __peernet2id(net, peer); + id = peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, RTM_GETNSID, net, id); if (err < 0) @@ -654,10 +684,11 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .idx = 0, .s_idx = cb->args[0], }; + unsigned long flags; - ASSERT_RTNL(); - + spin_lock_irqsave(&nsid_lock, flags); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); + spin_unlock_irqrestore(&nsid_lock, flags); cb->args[0] = net_cb.idx; return skb->len; -- cgit From cc3a572fe6cf586f478546215bc5d3694357d71e Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:52 +0200 Subject: netlink: rename private flags and states These flags and states have the same prefix (NETLINK_) that netlink socket options. To avoid confusion and to be able to name a flag like a socket option, let's use an other prefix: NETLINK_[S|F]_. Note: a comment has been fixed, it was talking about NETLINK_RECV_NO_ENOBUFS socket option instead of NETLINK_NO_ENOBUFS. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 59 ++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ec4adbdcb9b4..bf7f56d7a9aa 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -76,17 +76,17 @@ struct listeners { }; /* state bits */ -#define NETLINK_CONGESTED 0x0 +#define NETLINK_S_CONGESTED 0x0 /* flags */ -#define NETLINK_KERNEL_SOCKET 0x1 -#define NETLINK_RECV_PKTINFO 0x2 -#define NETLINK_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_KERNEL_SOCKET 0x1 +#define NETLINK_F_RECV_PKTINFO 0x2 +#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_F_RECV_NO_ENOBUFS 0x8 static inline int netlink_is_kernel(struct sock *sk) { - return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; + return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; } struct netlink_table *nl_table; @@ -256,8 +256,9 @@ static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_S_CONGESTED, + &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; sk->sk_error_report(sk); } @@ -270,8 +271,8 @@ static void netlink_rcv_wake(struct sock *sk) struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(NETLINK_CONGESTED, &nlk->state); - if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + clear_bit(NETLINK_S_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } @@ -1656,7 +1657,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { @@ -1671,7 +1672,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -1895,7 +1896,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(NETLINK_CONGESTED, &nlk->state)) { + !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); @@ -1956,7 +1957,7 @@ static void do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); @@ -1966,7 +1967,7 @@ static void do_one_broadcast(struct sock *sk, p->skb2 = NULL; } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else { p->congested |= val; @@ -2057,7 +2058,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) !test_bit(p->group - 1, nlk->groups)) goto out; - if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) { + if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) { ret = 1; goto out; } @@ -2076,7 +2077,7 @@ out: * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the - * NETLINK_RECV_NO_ENOBUFS socket option. + * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { @@ -2136,9 +2137,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: if (val) - nlk->flags |= NETLINK_RECV_PKTINFO; + nlk->flags |= NETLINK_F_RECV_PKTINFO; else - nlk->flags &= ~NETLINK_RECV_PKTINFO; + nlk->flags &= ~NETLINK_F_RECV_PKTINFO; err = 0; break; case NETLINK_ADD_MEMBERSHIP: @@ -2167,18 +2168,18 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } case NETLINK_BROADCAST_ERROR: if (val) - nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; else - nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; err = 0; break; case NETLINK_NO_ENOBUFS: if (val) { - nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(NETLINK_CONGESTED, &nlk->state); + nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; + clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { - nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } err = 0; break; @@ -2227,7 +2228,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2237,7 +2238,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2247,7 +2248,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2418,7 +2419,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*addr); } - if (nlk->flags & NETLINK_RECV_PKTINFO) + if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); memset(&scm, 0, sizeof(scm)); @@ -2502,7 +2503,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, goto out_sock_release; nlk = nlk_sk(sk); - nlk->flags |= NETLINK_KERNEL_SOCKET; + nlk->flags |= NETLINK_F_KERNEL_SOCKET; netlink_table_grab(); if (!nl_table[unit].registered) { -- cgit From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:53 +0200 Subject: netlink: allow to listen "all" netns More accurately, listen all netns that have a nsid assigned into the netns where the netlink socket is opened. For this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this socket will receive netlink notifications from all netns that have a nsid assigned into the netns where the socket has been opened. The nsid is sent to userland via an anscillary data. With this patch, a daemon needs only one socket to listen many netns. This is useful when the number of netns is high. Because 0 is a valid value for a nsid, the field nsid_is_set indicates if the field nsid is valid or not. skb->cb is initialized to 0 on skb allocation, thus we are sure that we will never send a nsid 0 by error to the userland. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 10 +++++++++- net/netlink/af_netlink.c | 52 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae5008b097de..a665bf490c88 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -229,7 +229,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) EXPORT_SYMBOL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ -static int peernet2id(struct net *net, struct net *peer) +int peernet2id(struct net *net, struct net *peer) { unsigned long flags; int id; @@ -240,6 +240,14 @@ static int peernet2id(struct net *net, struct net *peer) return id; } +/* This function returns true is the peer netns has an id assigned into the + * current netns. + */ +bool peernet_has_id(struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; +} + struct net *get_net_ns_by_id(struct net *net, int id) { unsigned long flags; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index bf7f56d7a9aa..a5fff75accf8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -83,6 +83,7 @@ struct listeners { #define NETLINK_F_RECV_PKTINFO 0x2 #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 static inline int netlink_is_kernel(struct sock *sk) { @@ -1932,8 +1933,17 @@ static void do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) return; - if (!net_eq(sock_net(sk), p->net)) - return; + if (!net_eq(sock_net(sk), p->net)) { + if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + return; + + if (!peernet_has_id(sock_net(sk), p->net)) + return; + + if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, + CAP_NET_BROADCAST)) + return; + } if (p->failure) { netlink_overrun(sk); @@ -1959,13 +1969,22 @@ static void do_one_broadcast(struct sock *sk, p->failure = 1; if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; - } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + goto out; + } + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if (sk_filter(sk, p->skb2)) { + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + goto out; + } + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + NETLINK_CB(p->skb2).nsid_is_set = true; + val = netlink_broadcast_deliver(sk, p->skb2); + if (val < 0) { netlink_overrun(sk); if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; @@ -1974,6 +1993,7 @@ static void do_one_broadcast(struct sock *sk, p->delivered = 1; p->skb2 = NULL; } +out: sock_put(sk); } @@ -2202,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; } #endif /* CONFIG_NETLINK_MMAP */ + case NETLINK_LISTEN_ALL_NSID: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) + return -EPERM; + + if (val) + nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; + else + nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2268,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } +static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + if (!NETLINK_CB(skb).nsid_is_set) + return; + + put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), + &NETLINK_CB(skb).nsid); +} + static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -2421,6 +2461,8 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); -- cgit From f1f00d8ff60ca056db3805406507483eeb3794d7 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 7 May 2015 16:34:51 +0200 Subject: pktgen: adjust flag NO_TIMESTAMP to be more pktgen compliant Allow flag NO_TIMESTAMP to turn timestamping on again, like other flags, with a negation of the flag like !NO_TIMESTAMP. Also document the option flag NO_TIMESTAMP. Fixes: afb84b626184 ("pktgen: add flag NO_TIMESTAMP to disable timestamping") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 508155b283dd..43bb21507373 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1267,6 +1267,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "NO_TIMESTAMP") == 0) pkt_dev->flags |= F_NO_TIMESTAMP; + else if (strcmp(f, "!NO_TIMESTAMP") == 0) + pkt_dev->flags &= ~F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", -- cgit From 62f64aed622b6055b5fc447e3e421c9351563fc8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 May 2015 16:35:32 +0200 Subject: pktgen: introduce xmit_mode '' Introduce xmit_mode 'netif_receive' for pktgen which generates the packets using familiar pktgen commands, but feeds them into netif_receive_skb() instead of ndo_start_xmit(). Default mode is called 'start_xmit'. It is designed to test netif_receive_skb and ingress qdisc performace only. Make sure to understand how it works before using it for other rx benchmarking. Sample script 'pktgen.sh': \#!/bin/bash function pgset() { local result echo $1 > $PGDEV result=`cat $PGDEV | fgrep "Result: OK:"` if [ "$result" = "" ]; then cat $PGDEV | fgrep Result: fi } [ -z "$1" ] && echo "Usage: $0 DEV" && exit 1 ETH=$1 PGDEV=/proc/net/pktgen/kpktgend_0 pgset "rem_device_all" pgset "add_device $ETH" PGDEV=/proc/net/pktgen/$ETH pgset "xmit_mode netif_receive" pgset "pkt_size 60" pgset "dst 198.18.0.1" pgset "dst_mac 90:e2:ba:ff:ff:ff" pgset "count 10000000" pgset "burst 32" PGDEV=/proc/net/pktgen/pgctrl echo "Running... ctrl^C to stop" pgset "start" echo "Done" cat /proc/net/pktgen/$ETH Usage: $ sudo ./pktgen.sh eth2 ... Result: OK: 232376(c232372+d3) usec, 10000000 (60byte,0frags) 43033682pps 20656Mb/sec (20656167360bps) errors: 10000000 Raw netif_receive_skb speed should be ~43 million packet per second on 3.7Ghz x86 and 'perf report' should look like: 37.69% kpktgend_0 [kernel.vmlinux] [k] __netif_receive_skb_core 25.81% kpktgend_0 [kernel.vmlinux] [k] kfree_skb 7.22% kpktgend_0 [kernel.vmlinux] [k] ip_rcv 5.68% kpktgend_0 [pktgen] [k] pktgen_thread_worker If fib_table_lookup is seen on top, it means skb was processed by the stack. To benchmark netif_receive_skb only make sure that 'dst_mac' of your pktgen script is different from receiving device mac and it will be dropped by ip_rcv Signed-off-by: Alexei Starovoitov Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 43bb21507373..8f2687da058e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -210,6 +210,10 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ +/* Xmit modes */ +#define M_START_XMIT 0 /* Default normal TX */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ + /* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -251,13 +255,14 @@ struct pktgen_dev { * we will do a random selection from within the range. */ __u32 flags; - int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ - + int xmit_mode; int min_pkt_size; int max_pkt_size; int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + struct page *page; u64 delay; /* nano-seconds */ @@ -620,6 +625,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) + seq_puts(seq, " xmit_mode: netif_receive\n"); + seq_puts(seq, " Flags: "); if (pkt_dev->flags & F_IPV6) @@ -1081,7 +1089,8 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) return len; if ((value > 0) && - (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || + !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; i += len; pkt_dev->clone_skb = value; @@ -1134,7 +1143,7 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; - if ((value > 1) && + if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; pkt_dev->burst = value < 1 ? 1 : value; @@ -1160,6 +1169,35 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "ERROR: node not possible"); return count; } + if (!strcmp(name, "xmit_mode")) { + char f[32]; + + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + + if (strcmp(f, "start_xmit") == 0) { + pkt_dev->xmit_mode = M_START_XMIT; + } else if (strcmp(f, "netif_receive") == 0) { + /* clone_skb set earlier, not supported in this mode */ + if (pkt_dev->clone_skb > 0) + return -ENOTSUPP; + + pkt_dev->xmit_mode = M_NETIF_RECEIVE; + } else { + sprintf(pg_result, + "xmit_mode -:%s:- unknown\nAvailable modes: %s", + f, "start_xmit, netif_receive\n"); + return count; + } + sprintf(pg_result, "OK: xmit_mode=%s", f); + return count; + } if (!strcmp(name, "flag")) { char f[32]; memset(f, 0, 32); @@ -3320,6 +3358,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) unsigned int burst = ACCESS_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; + struct sk_buff *skb; int ret; /* If device is offline, then don't send */ @@ -3357,6 +3396,38 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { + skb = pkt_dev->skb; + skb->protocol = eth_type_trans(skb, skb->dev); + atomic_add(burst, &skb->users); + local_bh_disable(); + do { + ret = netif_receive_skb(skb); + if (ret == NET_RX_DROP) + pkt_dev->errors++; + pkt_dev->sofar++; + pkt_dev->seq_num++; + if (atomic_read(&skb->users) != burst) { + /* skb was queued by rps/rfs or taps, + * so cannot reuse this skb + */ + atomic_sub(burst - 1, &skb->users); + /* get out of the loop and wait + * until skb is consumed + */ + pkt_dev->last_ok = 1; + break; + } + /* skb was 'freed' by stack, so clean few + * bits and reuse it + */ +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; /* reset reclass/redir ttl */ +#endif + } while (--burst > 0); + goto out; /* Skips xmit_mode M_START_XMIT */ + } + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); @@ -3404,6 +3475,7 @@ xmit_more: unlock: HARD_TX_UNLOCK(odev, txq); +out: local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ -- cgit From 80ba92fa1a92dea128283f69f55b02242e213650 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 May 2015 15:05:12 -0700 Subject: codel: add ce_threshold attribute For DCTCP or similar ECN based deployments on fabrics with shallow buffers, hosts are responsible for a good part of the buffering. This patch adds an optional ce_threshold to codel & fq_codel qdiscs, so that DCTCP can have feedback from queuing in the host. A DCTCP enabled egress port simply have a queue occupancy threshold above which ECT packets get CE mark. In codel language this translates to a sojourn time, so that one doesn't have to worry about bytes or bandwidth but delays. This makes the host an active participant in the health of the whole network. This also helps experimenting DCTCP in a setup without DCTCP compliant fabric. On following example, ce_threshold is set to 1ms, and we can see from 'ldelay xxx us' that TCP is not trying to go around the 5ms codel target. Queue has more capacity to absorb inelastic bursts (say from UDP traffic), as queues are maintained to an optimal level. lpaa23:~# ./tc -s -d qd sh dev eth1 qdisc mq 1: dev eth1 root Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961) backlog 3108242b 364p requeues 42961 qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503) rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503 count 0 lastcount 0 ldelay 1.0ms drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384 qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186) rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186 count 0 lastcount 0 ldelay 694us drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873 qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554) rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554 count 0 lastcount 0 ldelay 889us drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780 ... Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Daniel Borkmann Cc: Glenn Judd Cc: Nandita Dukkipati Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/sched/sch_codel.c | 15 +++++++++++++-- net/sched/sch_fq_codel.c | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index de28f8e968e8..1474b6560fac 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -6,7 +6,7 @@ * * Implemented on linux by : * Copyright (C) 2012 Michael D. Taht - * Copyright (C) 2012 Eric Dumazet + * Copyright (C) 2012,2015 Eric Dumazet * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, [TCA_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 }, }; static int codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); + + q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); @@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_CODEL_ECN, q->params.ecn)) goto nla_put_failure; - + if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, + codel_time_to_us(q->params.ce_threshold))) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: @@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ldelay = codel_time_to_us(q->vars.ldelay), .dropping = q->vars.dropping, .ecn_mark = q->stats.ecn_mark, + .ce_mark = q->stats.ce_mark, }; if (q->vars.dropping) { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a6fc53d69513..778739786b32 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -6,7 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Copyright (C) 2012 Eric Dumazet + * Copyright (C) 2012,2015 Eric Dumazet */ #include @@ -292,6 +292,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -322,6 +323,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); + + q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); @@ -441,6 +448,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; + if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, + codel_time_to_us(q->cparams.ce_threshold))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -459,6 +471,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.drop_overlimit = q->drop_overlimit; st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; + st.qdisc_stats.ce_mark = q->cstats.ce_mark; list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; -- cgit From 96a4688edb5e638a9f81c787c031bbdbfa291527 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 May 2015 14:01:36 +0200 Subject: mac80211: check fast-xmit if IBSS STA QoS changed When an IBSS station gets QoS enabled after having been added, check fast-xmit to make sure the QoS header gets added to the cache properly and frames can go out with QoS and higher rates. Reported-by: Janusz Dziedzic Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index bfef1b215050..21716af8bec3 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1031,8 +1031,11 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, } } - if (sta && elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS) + if (sta && !sta->sta.wme && + elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS) { sta->sta.wme = true; + ieee80211_check_fast_xmit(sta); + } if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && -- cgit From 890b7878e4435518c45d80c5e33c335aab39d4e2 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Thu, 7 May 2015 15:30:41 +0300 Subject: mac80211: TDLS: use the BSS chandef for HT/VHT operation IEs The chandef of the current channel context might be wider (though compatible). The TDLS link cares only about the channel of the BSS. In addition make sure to specify the VHT operation IE when VHT is supported on a non-2.4GHz band, as required by IEEE802.11ac-2013. This is not the same as HT-operation, to be specified only if the BSS doesn't support HT. Signed-off-by: Arik Nemtsov Reviewed-by: Johannes Berg Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index fff0d864adfa..8a92a920ff17 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -527,30 +527,19 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, /* if HT support is only added in TDLS, we need an HT-operation IE */ if (!ap_sta->sta.ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { - struct ieee80211_chanctx_conf *chanctx_conf = - rcu_dereference(sdata->vif.chanctx_conf); - if (!WARN_ON(!chanctx_conf)) { - pos = skb_put(skb, 2 + - sizeof(struct ieee80211_ht_operation)); - /* send an empty HT operation IE */ - ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, - &chanctx_conf->def, 0); - } + pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation)); + /* send an empty HT operation IE */ + ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, + &sdata->vif.bss_conf.chandef, 0); } ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); /* only include VHT-operation if not on the 2.4GHz band */ - if (band != IEEE80211_BAND_2GHZ && !ap_sta->sta.vht_cap.vht_supported && - sta->sta.vht_cap.vht_supported) { - struct ieee80211_chanctx_conf *chanctx_conf = - rcu_dereference(sdata->vif.chanctx_conf); - if (!WARN_ON(!chanctx_conf)) { - pos = skb_put(skb, 2 + - sizeof(struct ieee80211_vht_operation)); - ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap, - &chanctx_conf->def); - } + if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); + ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap, + &sdata->vif.bss_conf.chandef); } rcu_read_unlock(); -- cgit From 140e807da12988e2a925fe029336e7bb67a8d4de Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:07:08 -0500 Subject: tun: Utilize the normal socket network namespace refcounting. There is no need for tun to do the weird network namespace refcounting. The existing network namespace refcounting in tfile has almost exactly the same lifetime. So rewrite the code to use the struct sock network namespace refcounting and remove the unnecessary hand rolled network namespace refcounting and the unncesary tfile->net. This change allows the tun code to directly call sock_put bypassing sock_release and making SOCK_EXTERNALLY_ALLOCATED unnecessary. Remove the now unncessary tun_release so that if anything tries to use the sock_release code path the kernel will oops, and let us know about the bug. The macvtap code already uses it's internal socket this way. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/socket.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 884e32997698..b5f1f43ed8f4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -576,9 +576,6 @@ void sock_release(struct socket *sock) if (rcu_dereference_protected(sock->wq, 1)->fasync_list) pr_err("%s: fasync list not empty!\n", __func__); - if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags)) - return; - this_cpu_sub(sockets_in_use, 1); if (!sock->file) { iput(SOCK_INODE(sock)); -- cgit From eeb1bd5c40edb0e2fd925c8535e2fdebdbc5cef2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:08:05 -0500 Subject: net: Add a struct net parameter to sock_create_kern This is long overdue, and is part of cleaning up how we allocate kernel sockets that don't reference count struct net. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/core.c | 2 +- net/ceph/messenger.c | 4 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 2 +- net/l2tp/l2tp_core.c | 4 ++-- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- net/rxrpc/ar-local.c | 4 ++-- net/socket.c | 4 ++-- 9 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 4fea24275b17..29709fbfd1f5 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -200,7 +200,7 @@ static int rfcomm_l2sock_create(struct socket **sock) BT_DBG(""); - err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); + err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); if (!err) { struct sock *sk = (*sock)->sk; sk->sk_data_ready = rfcomm_l2data_ready; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 967080a9f043..073262fea6dd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -480,8 +480,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) int ret; BUG_ON(con->sock); - ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + ret = sock_create_kern(&init_net, con->peer_addr.in_addr.ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b47a4d79d04..09f4d024dfe5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(family, type, protocol, &sock); + int rc = sock_create_kern(&init_net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6bb98cc193c9..4e2837476967 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -15,7 +15,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index bba8903e871f..478576b61214 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -19,7 +19,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, int err; struct socket *sock = NULL; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index a29a504492af..ae513a2fe7f3 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1399,7 +1399,7 @@ static int l2tp_tunnel_sock_create(struct net *net, if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, + err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; @@ -1429,7 +1429,7 @@ static int l2tp_tunnel_sock_create(struct net *net, { struct sockaddr_l2tpip ip_addr = {0}; - err = sock_create_kern(AF_INET, SOCK_DGRAM, + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 19b9cce6c210..2e9a5b5d1239 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1458,7 +1458,7 @@ static struct socket *make_send_sock(struct net *net, int id) int result; /* First create a socket move it to right name space later */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); @@ -1518,7 +1518,7 @@ static struct socket *make_receive_sock(struct net *net, int id) int result; /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index ca904ed5400a..78483b4602bf 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -73,8 +73,8 @@ static int rxrpc_create_local(struct rxrpc_local *local) _enter("%p{%d}", local, local->srx.transport_type); /* create a socket to represent the local endpoint */ - ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP, - &local->socket); + ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, + IPPROTO_UDP, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; diff --git a/net/socket.c b/net/socket.c index b5f1f43ed8f4..9963a0b53a64 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1210,9 +1210,9 @@ int sock_create(int family, int type, int protocol, struct socket **res) } EXPORT_SYMBOL(sock_create); -int sock_create_kern(int family, int type, int protocol, struct socket **res) +int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { - return __sock_create(&init_net, family, type, protocol, res, 1); + return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); -- cgit From 11aa9c28b4209242a9de0a661a7b3405adb568a0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:09:13 -0500 Subject: net: Pass kern from net_proto_family.create to sk_alloc In preparation for changing how struct net is refcounted on kernel sockets pass the knowledge that we are creating a kernel socket from sock_create_kern through to sk_alloc. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 2 +- net/atm/common.c | 4 ++-- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 4 ++-- net/bluetooth/bnep/sock.c | 2 +- net/bluetooth/cmtp/sock.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/hidp/sock.c | 2 +- net/bluetooth/l2cap_sock.c | 10 +++++----- net/bluetooth/rfcomm/sock.c | 8 ++++---- net/bluetooth/sco.c | 8 ++++---- net/caif/caif_socket.c | 2 +- net/can/af_can.c | 2 +- net/core/sock.c | 3 ++- net/decnet/af_decnet.c | 8 ++++---- net/ieee802154/socket.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipx/af_ipx.c | 2 +- net/irda/af_irda.c | 2 +- net/iucv/af_iucv.c | 10 +++++----- net/key/af_key.c | 2 +- net/l2tp/l2tp_ppp.c | 4 ++-- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 6 +++--- net/netlink/af_netlink.c | 11 +++++------ net/netrom/af_netrom.c | 4 ++-- net/nfc/af_nfc.c | 2 +- net/nfc/llcp.h | 2 +- net/nfc/llcp_core.c | 2 +- net/nfc/llcp_sock.c | 8 ++++---- net/nfc/nfc.h | 2 +- net/nfc/rawsock.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/af_phonet.c | 2 +- net/phonet/pep.c | 2 +- net/rds/af_rds.c | 2 +- net/rose/af_rose.c | 4 ++-- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 8 ++++---- net/vmw_vsock/af_vsock.c | 7 ++++--- net/vmw_vsock/vmci_transport.c | 2 +- net/x25/af_x25.c | 8 ++++---- 48 files changed, 90 insertions(+), 89 deletions(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 3b7ad43c7dad..d5871ac493eb 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto); + sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) goto out; rc = 0; diff --git a/net/atm/common.c b/net/atm/common.c index ed0466637e13..49a872db7e42 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -141,7 +141,7 @@ static struct proto vcc_proto = { .release_cb = vcc_release_cb, }; -int vcc_create(struct net *net, struct socket *sock, int protocol, int family) +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern) { struct sock *sk; struct atm_vcc *vcc; @@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family) sock->sk = NULL; if (sock->type == SOCK_STREAM) return -EINVAL; - sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto); + sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); diff --git a/net/atm/common.h b/net/atm/common.h index 4d6f5b2068ac..959436b87182 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -10,7 +10,7 @@ #include /* for poll_table */ -int vcc_create(struct net *net, struct socket *sock, int protocol, int family); +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern); int vcc_release(struct socket *sock); int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index ae0324021407..040207ec399f 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &pvc_proto_ops; - return vcc_create(net, sock, protocol, PF_ATMPVC); + return vcc_create(net, sock, protocol, PF_ATMPVC, kern); } static const struct net_proto_family pvc_family_ops = { diff --git a/net/atm/svc.c b/net/atm/svc.c index 1ba23f5018e7..3fa0a9ee98d1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; - error = vcc_create(net, sock, protocol, AF_ATMSVC); + error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 330c1f4a5a0b..4273533d22b1 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -855,7 +855,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto); + sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern); if (sk == NULL) return -ENOMEM; @@ -881,7 +881,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) struct sock *sk; ax25_cb *ax25, *oax25; - sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index bde2bdd9e929..b5116fa9835e 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -202,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index d82787d417bd..ce86a7bae844 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 56f9edbf3d05..5b14dcafcd08 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1377,7 +1377,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &hci_sock_ops; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index cb3fdde1968a..008ba439bd62 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -235,7 +235,7 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a7278f05eafb..244287706f91 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = { static const struct proto_ops l2cap_sock_ops; static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio); + int proto, gfp_t prio, int kern); bool l2cap_is_socket(struct socket *sock) { @@ -1193,7 +1193,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) } sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, - GFP_ATOMIC); + GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return NULL; @@ -1523,12 +1523,12 @@ static struct proto l2cap_proto = { }; static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio) + int proto, gfp_t prio, int kern) { struct sock *sk; struct l2cap_chan *chan; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern); if (!sk) return NULL; @@ -1574,7 +1574,7 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &l2cap_sock_ops; - sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 825e8fb5114b..b2338e971b33 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -269,12 +269,12 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); if (!sk) return NULL; @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, sock->ops = &rfcomm_sock_ops; - sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -969,7 +969,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * goto done; } - sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC); + sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0); if (!sk) goto done; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 4322c833e748..6b6e59dc54cf 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -460,11 +460,11 @@ static struct proto sco_proto = { .obj_size = sizeof(struct sco_pinfo) }; -static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern); if (!sk) return NULL; @@ -501,7 +501,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &sco_sock_ops; - sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -1026,7 +1026,7 @@ static void sco_conn_ready(struct sco_conn *conn) bh_lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, - BTPROTO_SCO, GFP_ATOMIC); + BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { bh_unlock_sock(parent); sco_conn_unlock(conn); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4ec0c803aef1..78a04ebb113c 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1047,7 +1047,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ - sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot); + sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; diff --git a/net/can/af_can.c b/net/can/af_can.c index 32d710eaf1fc..d4d404bdfc9a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -179,7 +179,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, sock->ops = cp->ops; - sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot); + sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; diff --git a/net/core/sock.c b/net/core/sock.c index e891bcf325ca..cbc3789b830c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1396,9 +1396,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx); * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 754484b3cd0e..675cf94e04f8 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -468,10 +468,10 @@ static struct proto dn_proto = { .obj_size = sizeof(struct dn_sock), }; -static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp) +static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern) { struct dn_scp *scp; - struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto); + struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern); if (!sk) goto out; @@ -693,7 +693,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, } - if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL) + if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL) return -ENOBUFS; sk->sk_protocol = protocol; @@ -1096,7 +1096,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) cb = DN_SKB_CB(skb); sk->sk_ack_backlog--; - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation); + newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); if (newsk == NULL) { release_sock(sk); kfree_skb(skb); diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index b60c65f70346..7aaaf967df58 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1014,7 +1014,7 @@ static int ieee802154_create(struct net *net, struct socket *sock, } rc = -ENOMEM; - sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto); + sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 09f4d024dfe5..e2dd9cb99d61 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -317,7 +317,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4632afa57e05..f3866c0b6cfe 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -167,7 +167,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 4ea5d7497b5f..48d0dc89b58d 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto); + sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern); if (!sk) goto out; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index ee0ea25c8e7a..fae6822cc367 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1100,7 +1100,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, } /* Allocate networking socket */ - sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto); + sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6daa52a18d40..918151c11348 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -535,12 +535,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; } -static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) +static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; struct iucv_sock *iucv; - sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto); + sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern); if (!sk) return NULL; iucv = iucv_sk(sk); @@ -602,7 +602,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL); + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); if (!sk) return -ENOMEM; @@ -1723,7 +1723,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); if (!nsk) { err = pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1933,7 +1933,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || diff --git a/net/key/af_key.c b/net/key/af_key.c index f0d52d721b3a..9e834ec475a9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, return -EPROTONOSUPPORT; err = -ENOMEM; - sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto); + sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) goto out; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index e9b0dec56b8e..f56c9f69e9f2 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -542,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) /* socket() handler. Initialize a new struct sock. */ -static int pppol2tp_create(struct net *net, struct socket *sock) +static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 17a8dff06090..8fd9febaa5ba 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; - sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto); + sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 81a61fce3afb..3e821daf9dd4 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk, struct llc_addr *daddr) { struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, - sk->sk_prot); + sk->sk_prot, 0); struct llc_sock *newllc, *llc = llc_sk(sk); if (!newsk) @@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk) * Allocates a LLC sock and initializes it. Returns the new LLC sock * or %NULL if there's no memory available for one */ -struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot) +struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { - struct sock *sk = sk_alloc(net, family, priority, prot); + struct sock *sk = sk_alloc(net, family, priority, prot, kern); if (!sk) goto out; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a5fff75accf8..be6665ab7f40 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1119,14 +1119,15 @@ static struct proto netlink_proto = { }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol) + struct mutex *cb_mutex, int protocol, + int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; - sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; @@ -1188,7 +1189,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, if (err < 0) goto out; - err = __netlink_create(net, sock, cb_mutex, protocol); + err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; @@ -2515,14 +2516,12 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - /* * We have to just have a reference on the net from sk, but don't * get_net it. Besides, we cannot get and then put the net here. * So we create one inside init_net and the move it to net. */ - - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(&init_net, sock, cb_mutex, unit, 0) < 0) goto out_sock_release_nosk; sk = sock->sk; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index b987fd56c3c5..ed212ffc1d9d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -433,7 +433,7 @@ static int nr_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto); + sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; @@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 2277276f52bc..54e40fa47822 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto, read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { - rc = proto_tab[proto]->create(net, sock, proto_tab[proto]); + rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index de1789e3cc82..1f68724d44d3 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction); /* Sock API */ -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp); +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern); void nfc_llcp_sock_free(struct nfc_llcp_sock *sock); void nfc_llcp_accept_unlink(struct sock *sk); void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index b18f07ccb504..98876274a1ee 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, sock->ssap = ssap; } - new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC); + new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9578bd6a4f3e..b7de0da46acd 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk) } } -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp) +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; - sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto); + sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; @@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) } static int llcp_sock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock, else sock->ops = &llcp_sock_ops; - sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC); + sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index a8ce80b47720..5c93e8412a26 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -30,7 +30,7 @@ struct nfc_protocol { struct proto *proto; struct module *owner; int (*create)(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto); + const struct nfc_protocol *nfc_proto, int kern); }; struct nfc_rawsock { diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 82b4e8024778..e9a91488fe3d 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -334,7 +334,7 @@ static void rawsock_destruct(struct sock *sk) } static int rawsock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -348,7 +348,7 @@ static int rawsock_create(struct net *net, struct socket *sock, else sock->ops = &rawsock_ops; - sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto); + sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) return -ENOMEM; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5102c3cc4eec..94713276a1d9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2832,7 +2832,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock->state = SS_UNCONNECTED; err = -ENOBUFS; - sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 32ab87d34828..10d42f3220ab 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, goto out; } - sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot); + sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 6de2aeb98a1f..850a86cde0b3 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) } /* Create a new to-be-accepted sock */ - newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 10443377fb9d..3d83641f2861 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -440,7 +440,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 8ae603069a1a..36dbc2da3661 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -520,7 +520,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern); if (sk == NULL) return -ENOMEM; @@ -559,7 +559,7 @@ static struct sock *rose_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0); if (sk == NULL) return NULL; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0095b9a0b779..25d60ed15284 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -632,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, sock->ops = &rxrpc_rpc_ops; sock->state = SS_UNCONNECTED; - sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto); + sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0e4198ee2370..e703ff7fed40 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -635,7 +635,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) goto out; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 53b7acde9aa3..59e80356672b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -550,7 +550,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot); + sk->sk_prot, 0); struct inet_sock *newinet; if (!newsk) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9074b5cede38..8f3c8e2cef8e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -342,7 +342,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, } /* Allocate socket's protocol area */ - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); + sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5266ea7b922b..941b3d26e3bf 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -620,7 +620,7 @@ static struct proto unix_proto = { */ static struct lock_class_key af_unix_sk_receive_queue_lock_key; -static struct sock *unix_create1(struct net *net, struct socket *sock) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { struct sock *sk = NULL; struct unix_sock *u; @@ -629,7 +629,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); if (!sk) goto out; @@ -688,7 +688,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -1088,7 +1088,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL); + newsk = unix_create1(sock_net(sk), NULL, 0); if (newsk == NULL) goto out; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 2ec86e652a19..df5fc6b340f1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -581,13 +581,14 @@ struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, - unsigned short type) + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; - sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto); + sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; @@ -1866,7 +1867,7 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM; + return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; } static const struct net_proto_family vsock_family_ops = { diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index c294da095461..1f63daff3965 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1022,7 +1022,7 @@ static int vmci_transport_recv_listen(struct sock *sk, } pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type); + sk->sk_type, 0); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c3ab230e4493..a750f330b8dd 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -515,10 +515,10 @@ static struct proto x25_proto = { .obj_size = sizeof(struct x25_sock), }; -static struct sock *x25_alloc_socket(struct net *net) +static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; - struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto); + struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; @@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOBUFS; - if ((sk = x25_alloc_socket(net)) == NULL) + if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); @@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) goto out; - if ((sk = x25_alloc_socket(sock_net(osk))) == NULL) + if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); -- cgit From 26abe14379f8e2fa3fd1bcf97c9a7ad9364886fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:10:31 -0500 Subject: net: Modify sk_alloc to not reference count the netns of kernel sockets. Now that sk_alloc knows when a kernel socket is being allocated modify it to not reference count the network namespace of kernel sockets. Keep track of if a socket needs reference counting by adding a flag to struct sock called sk_net_refcnt. Update all of the callers of sock_create_kern to stop using sk_change_net and sk_release_kernel as those hacks are no longer needed, to avoid reference counting a kernel socket. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/sock.c | 8 ++++++-- net/ipv4/af_inet.c | 4 +--- net/ipv4/udp_tunnel.c | 8 +++----- net/ipv6/ip6_udp_tunnel.c | 6 ++---- net/l2tp/l2tp_core.c | 15 ++++++--------- net/netfilter/ipvs/ip_vs_sync.c | 30 +++++++++--------------------- 6 files changed, 27 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index cbc3789b830c..9e8968e9ebeb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1412,7 +1412,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) + get_net(net); + sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); @@ -1446,7 +1449,8 @@ static void __sk_free(struct sock *sk) if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + if (likely(sk->sk_net_refcnt)) + put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e2dd9cb99d61..235d36afece3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(&init_net, family, type, protocol, &sock); + int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; @@ -1440,8 +1440,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, * we do not wish this socket to see incoming packets. */ (*sk)->sk_prot->unhash(*sk); - - sk_change_net(*sk, net); } return rc; } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 4e2837476967..933ea903f7b8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; @@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 478576b61214..e1a1136bda7c 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -19,12 +19,10 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, int err; struct socket *sock = NULL; - err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); @@ -55,7 +53,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ae513a2fe7f3..f6b090df3930 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1334,9 +1334,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work) if (sock) inet_shutdown(sock, 2); } else { - if (sock) + if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sk); + sock_release(sock); + } } l2tp_tunnel_sock_put(sk); @@ -1399,13 +1400,11 @@ static int l2tp_tunnel_sock_create(struct net *net, if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; - err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); @@ -1429,13 +1428,11 @@ static int l2tp_tunnel_sock_create(struct net *net, { struct sockaddr_l2tpip ip_addr = {0}; - err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; @@ -1462,7 +1459,7 @@ out: *sockp = sock; if ((err < 0) && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); *sockp = NULL; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 2e9a5b5d1239..b08ba9538d12 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1457,18 +1457,12 @@ static struct socket *make_send_sock(struct net *net, int id) struct socket *sock; int result; - /* First create a socket move it to right name space later */ - result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + /* First create a socket */ + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1497,7 +1491,7 @@ static struct socket *make_send_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1518,17 +1512,11 @@ static struct socket *make_receive_sock(struct net *net, int id) int result; /* First create a socket */ - result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); @@ -1554,7 +1542,7 @@ static struct socket *make_receive_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1692,7 +1680,7 @@ done: ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo); return 0; @@ -1729,7 +1717,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); @@ -1854,11 +1842,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) return 0; outsocket: - sk_release_kernel(sock->sk); + sock_release(sock); outtinfo: if (tinfo) { - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); } -- cgit From 13d3078e22f5651cd0184ce1b4e4ab8a46c62b27 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:11:33 -0500 Subject: netlink: Create kernel netlink sockets in the proper network namespace Utilize the new functionality of sk_alloc so that nothing needs to be done to suprress the reference counting on kernel sockets. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index be6665ab7f40..e9b9559731bb 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2516,16 +2516,11 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - /* - * We have to just have a reference on the net from sk, but don't - * get_net it. Besides, we cannot get and then put the net here. - * So we create one inside init_net and the move it to net. - */ - if (__netlink_create(&init_net, sock, cb_mutex, unit, 0) < 0) + + if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; - sk_change_net(sk, net); if (!cfg || cfg->groups < 32) groups = 32; @@ -2581,7 +2576,10 @@ EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { - sk_release_kernel(sk); + if (sk == NULL || sk->sk_socket == NULL) + return; + + sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); -- cgit From affb9792f1d99e1e4d64411e147b648d65f2576e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:12:13 -0500 Subject: net: kill sk_change_net and sk_release_kernel These functions are no longer needed and no longer used kill them. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/sock.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 9e8968e9ebeb..c18738a795b0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1466,25 +1466,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -/* - * Last sock_put should drop reference to sk->sk_net. It has already - * been dropped in sk_change_net. Taking reference to stopping namespace - * is not an option. - * Take reference to a socket to remove it from hash _alive_ and after that - * destroy it in the context of init_net. - */ -void sk_release_kernel(struct sock *sk) -{ - if (sk == NULL || sk->sk_socket == NULL) - return; - - sock_hold(sk); - sock_net_set(sk, get_net(&init_net)); - sock_release(sk->sk_socket); - sock_put(sk); -} -EXPORT_SYMBOL(sk_release_kernel); - static void sk_update_clone(const struct sock *sk, struct sock *newsk) { if (mem_cgroup_sockets_enabled && sk->sk_cgrp) -- cgit From c9e99fd078ef7fdcd9ee4f5a4cfdbece319587af Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 May 2015 22:51:31 +0200 Subject: net: sched: consolidate handle_ing and ing_filter Given quite some code has been removed from ing_filter(), we can just consolidate that function into handle_ing() and get rid of a few instructions at the same time. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 46 ++++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 862875ec8f2f..8a757464bfa2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3521,37 +3521,19 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif #ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesn't stop any functionality; if you dont have - * the ingress scheduler, you just can't add policies on ingress. - * - */ -static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) -{ - int result = TC_ACT_OK; - struct Qdisc *q; - - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - q = rcu_dereference(rxq->qdisc); - if (q != &noop_qdisc) { - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - } - - return result; -} - static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + struct Qdisc *q; - if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) + /* If there's at least one ingress present somewhere (so + * we get here via enabled static key), remaining devices + * that are not configured with an ingress qdisc will bail + * out w/o the rcu_dereference(). + */ + if (!rxq || (q = rcu_dereference(rxq->qdisc)) == &noop_qdisc) return skb; if (*pt_prev) { @@ -3559,11 +3541,15 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, *pt_prev = NULL; } - switch (ing_filter(skb, rxq)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + + if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + switch (qdisc_enqueue_root(skb, q)) { + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NULL; + } } return skb; -- cgit From d2788d34885d4ce5ba17a8996fd95d28942e574e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 May 2015 22:51:32 +0200 Subject: net: sched: further simplify handle_ing Ingress qdisc has no other purpose than calling into tc_classify() that executes attached classifier(s) and action(s). It has a 1:1 relationship to dev->ingress_queue. After having commit 087c1a601ad7 ("net: sched: run ingress qdisc without locks") removed the central ingress lock, one major contention point is gone. The extra indirection layers however, are not necessary for calling into ingress qdisc. pktgen calling locally into netif_receive_skb() with a dummy u32, single CPU result on a Supermicro X10SLM-F, Xeon E3-1240: before ~21,1 Mpps, after patch ~22,9 Mpps. We can redirect the private classifier list to the netdev directly, without changing any classifier API bits (!) and execute on that from handle_ing() side. The __QDISC_STATE_DEACTIVATE test can be removed, ingress qdisc doesn't have a queue and thus dev_deactivate_queue() is also not applicable, ingress_cl_list provides similar behaviour. In other words, ingress qdisc acts like TCQ_F_BUILTIN qdisc. One next possible step is the removal of the dev's ingress (dummy) netdev_queue, and to only have the list member in the netdevice itself. Note, the filter chain is RCU protected and individual filter elements are being kfree'd by sched subsystem after RCU grace period. RCU read lock is being held by __netif_receive_skb_core(). Joint work with Alexei Starovoitov. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 30 +++++++++++++++---------- net/sched/sch_ingress.c | 58 ++++++++----------------------------------------- 2 files changed, 27 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8a757464bfa2..e5f77c40bbd1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3525,31 +3525,37 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - struct Qdisc *q; + struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct tcf_result cl_res; /* If there's at least one ingress present somewhere (so * we get here via enabled static key), remaining devices * that are not configured with an ingress qdisc will bail - * out w/o the rcu_dereference(). + * out here. */ - if (!rxq || (q = rcu_dereference(rxq->qdisc)) == &noop_qdisc) + if (!cl) return skb; - if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } + qdisc_bstats_update_cpu(cl->q, skb); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - switch (qdisc_enqueue_root(skb, q)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; - } + switch (tc_classify(skb, cl, &cl_res)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + qdisc_qstats_drop_cpu(cl->q); + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + kfree_skb(skb); + return NULL; + default: + break; } return skb; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index a89cc3278bfb..e7c648fa9dc3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -12,16 +12,10 @@ #include #include #include + #include #include - -struct ingress_qdisc_data { - struct tcf_proto __rcu *filter_list; -}; - -/* ------------------------- Class/flow operations ------------------------- */ - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -49,45 +43,11 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = qdisc_priv(sch); - - return &p->filter_list; -} - -/* --------------------------- Qdisc operations ---------------------------- */ + struct net_device *dev = qdisc_dev(sch); -static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = qdisc_priv(sch); - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result; - - result = tc_classify(skb, fl, &res); - - qdisc_bstats_update_cpu(sch, skb); - switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - qdisc_qstats_drop_cpu(sch); - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - default: - result = TC_ACT_OK; - break; - } - - return result; + return &dev->ingress_cl_list; } -/* ------------------------------------------------------------- */ - static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { net_inc_ingress_queue(); @@ -98,9 +58,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_destroy_chain(&p->filter_list); + tcf_destroy_chain(&dev->ingress_cl_list); net_dec_ingress_queue(); } @@ -111,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; + return nla_nest_end(skb, nest); nla_put_failure: @@ -131,8 +92,6 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", - .priv_size = sizeof(struct ingress_qdisc_data), - .enqueue = ingress_enqueue, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -149,6 +108,7 @@ static void __exit ingress_module_exit(void) unregister_qdisc(&ingress_qdisc_ops); } -module_init(ingress_module_init) -module_exit(ingress_module_exit) +module_init(ingress_module_init); +module_exit(ingress_module_exit); + MODULE_LICENSE("GPL"); -- cgit From 658358cec93a7130615cfc1d6843ab07e49625e6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 May 2015 19:12:09 +0200 Subject: mac80211: fix throughput LED trigger As I was testing with hwsim, I missed that my previous commit to make LED work depend on activation broke the code because I missed removing the old trigger struct and some code was still using it, now erroneously, causing crashes. Fix this by always using the correct struct. Reported-by: Felix Fietkau Tested-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 - net/mac80211/led.c | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 241b74f3bd81..2c4fe45ea38a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1038,7 +1038,6 @@ enum queue_stop_reason { #ifdef CONFIG_MAC80211_LEDS struct tpt_led_trigger { - struct led_trigger trig; char name[32]; const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; diff --git a/net/mac80211/led.c b/net/mac80211/led.c index 38f05565eaac..0505845b7ab8 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -276,10 +276,10 @@ static void tpt_trig_timer(unsigned long data) } } - read_lock(&tpt_trig->trig.leddev_list_lock); - list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) + read_lock(&local->tpt_led.leddev_list_lock); + list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) led_blink_set(led_cdev, &on, &off); - read_unlock(&tpt_trig->trig.leddev_list_lock); + read_unlock(&local->tpt_led.leddev_list_lock); } const char * @@ -341,10 +341,10 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); - read_lock(&tpt_trig->trig.leddev_list_lock); - list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) + read_lock(&local->tpt_led.leddev_list_lock); + list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) led_set_brightness(led_cdev, LED_OFF); - read_unlock(&tpt_trig->trig.leddev_list_lock); + read_unlock(&local->tpt_led.leddev_list_lock); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, -- cgit From b396cca6fafccf16206a5d041d59c9e6b65b6f5a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 May 2015 09:06:56 -0700 Subject: net: sched: deprecate enqueue_root() Only left enqueue_root() user is netem, and it looks not necessary : qdisc_skb_cb(skb)->pkt_len is preserved after one skb_clone() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 956ead2cab9a..5abd1d9de989 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -440,9 +440,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { struct Qdisc *rootq = qdisc_root(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - q->duplicate = 0; - qdisc_enqueue_root(skb2, rootq); + q->duplicate = 0; + rootq->enqueue(skb2, rootq); q->duplicate = dupsave; } -- cgit From 9451980a6646ed487efce04a9df28f450935683e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:40 -0700 Subject: net: Use cached copy of pfmemalloc to avoid accessing page While testing I found that the testing for pfmemalloc in build_skb was rather expensive. I found the issue to be two-fold. First we have to get from the virtual address to the head page and that comes at the cost of something like 11 cycles. Then there is the cost for reading pfmemalloc out of the head page which can be cache cold due to the fact that put_page_testzero is likely invalidating the cache-line on one or more CPUs as the fragments can be shared. To avoid this extra expense I have added a pfmemalloc member to the netdev_alloc_cache. I then pushed pieces of __alloc_rx_skb into __napi_alloc_skb and __netdev_alloc_skb so that I could rewrite them to make use of the cached pfmemalloc value. The result is that my perf traces show a reduction from 9.28% overhead to 3.7% for the code covered by build_skb, __alloc_rx_skb, and __napi_alloc_skb when performing a test with the packet being dropped instead of being handed to napi_gro_receive. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 141 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 79 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b9eb90b39ac7..d6851ca32598 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -353,6 +353,7 @@ struct netdev_alloc_cache { * containing page->_count every time we allocate a fragment. */ unsigned int pagecnt_bias; + bool pfmemalloc; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); @@ -379,10 +380,9 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, return page; } -static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache, +static void *__alloc_page_frag(struct netdev_alloc_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc = this_cpu_ptr(cache); struct page *page = nc->frag.page; unsigned int size; int offset; @@ -402,6 +402,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ + nc->pfmemalloc = page->pfmemalloc; nc->pagecnt_bias = size; nc->frag.offset = size; } @@ -430,11 +431,13 @@ refill: static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { + struct netdev_alloc_cache *nc; unsigned long flags; void *data; local_irq_save(flags); - data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask); + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -454,7 +457,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask); + struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + return __alloc_page_frag(nc, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -464,76 +469,64 @@ void *napi_alloc_frag(unsigned int fragsz) EXPORT_SYMBOL(napi_alloc_frag); /** - * __alloc_rx_skb - allocate an skbuff for rx + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb - * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for - * allocations in case we have to fallback to __alloc_skb() - * If SKB_ALLOC_NAPI is set, page fragment will be allocated - * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate + * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. */ -static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, - int flags) +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, + gfp_t gfp_mask) { - struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct netdev_alloc_cache *nc; + unsigned long flags; + struct sk_buff *skb; + bool pfmemalloc; + void *data; - if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data; + len += NET_SKB_PAD; - if (sk_memalloc_socks()) - gfp_mask |= __GFP_MEMALLOC; + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_WAIT | GFP_DMA))) + return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); - data = (flags & SKB_ALLOC_NAPI) ? - __napi_alloc_frag(fragsz, gfp_mask) : - __netdev_alloc_frag(fragsz, gfp_mask); + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); - if (likely(data)) { - skb = build_skb(data, fragsz); - if (unlikely(!skb)) - put_page(virt_to_head_page(data)); - } - } else { - skb = __alloc_skb(length, gfp_mask, - SKB_ALLOC_RX, NUMA_NO_NODE); - } - return skb; -} + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; -/** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has NET_SKB_PAD headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. - */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) -{ - struct sk_buff *skb; + local_irq_save(flags); - length += NET_SKB_PAD; - skb = __alloc_rx_skb(length, gfp_mask, 0); + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = dev; + local_irq_restore(flags); + + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + put_page(virt_to_head_page(data)); + return NULL; } + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); @@ -551,19 +544,43 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, - unsigned int length, gfp_t gfp_mask) +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, + gfp_t gfp_mask) { + struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; + void *data; + + len += NET_SKB_PAD + NET_IP_ALIGN; + + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_WAIT | GFP_DMA))) + return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); - length += NET_SKB_PAD + NET_IP_ALIGN; - skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + data = __alloc_page_frag(nc, len, gfp_mask); + if (unlikely(!data)) + return NULL; - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - skb->dev = napi->dev; + skb = __build_skb(data, len); + if (unlikely(!skb)) { + put_page(virt_to_head_page(data)); + return NULL; } + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (nc->pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + return skb; } EXPORT_SYMBOL(__napi_alloc_skb); -- cgit From 0e39250845c0f91acc64264709b25f7f9b85c2c3 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:51 -0700 Subject: net: Store virtual address instead of page in netdev_alloc_cache This change makes it so that we store the virtual address of the page in the netdev_alloc_cache instead of the page pointer. The idea behind this is to avoid multiple calls to page_address since the virtual address is required for every access, but the page pointer is only needed at allocation or reset of the page. While I was at it I also reordered the netdev_alloc_cache structure a bit so that the size is always 16 bytes by dropping size in the case where PAGE_SIZE is greater than or equal to 32KB. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 55 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d6851ca32598..a3062ec341c3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -348,7 +348,13 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { - struct page_frag frag; + void * va; +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + __u16 offset; + __u16 size; +#else + __u32 offset; +#endif /* we maintain a pagecount bias, so that we dont dirty cache line * containing page->_count every time we allocate a fragment. */ @@ -361,21 +367,20 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, gfp_t gfp_mask) { - const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER; struct page *page = NULL; gfp_t gfp = gfp_mask; - if (order) { - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); - nc->frag.size = PAGE_SIZE << (page ? order : 0); - } - +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, + NETDEV_FRAG_PAGE_MAX_ORDER); + nc->size = page ? NETDEV_FRAG_PAGE_MAX_SIZE : PAGE_SIZE; +#endif if (unlikely(!page)) page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - nc->frag.page = page; + nc->va = page ? page_address(page) : NULL; return page; } @@ -383,19 +388,20 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, static void *__alloc_page_frag(struct netdev_alloc_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { - struct page *page = nc->frag.page; - unsigned int size; + unsigned int size = PAGE_SIZE; + struct page *page; int offset; - if (unlikely(!page)) { + if (unlikely(!nc->va)) { refill: page = __page_frag_refill(nc, gfp_mask); if (!page) return NULL; - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ @@ -404,17 +410,20 @@ refill: /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page->pfmemalloc; nc->pagecnt_bias = size; - nc->frag.offset = size; + nc->offset = size; } - offset = nc->frag.offset - fragsz; + offset = nc->offset - fragsz; if (unlikely(offset < 0)) { + page = virt_to_page(nc->va); + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) goto refill; - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif /* OK, page count is 0, we can safely set it */ atomic_set(&page->_count, size); @@ -424,9 +433,9 @@ refill: } nc->pagecnt_bias--; - nc->frag.offset = offset; + nc->offset = offset; - return page_address(page) + offset; + return nc->va + offset; } static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) -- cgit From b63ae8ca096dfdbfeef6a209c30a93a966518853 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:57 -0700 Subject: mm/net: Rename and move page fragment handling from net/ to mm/ This change moves the __alloc_page_frag functionality out of the networking stack and into the page allocation portion of mm. The idea it so help make this maintainable by placing it with other page allocation functions. Since we are moving it from skbuff.c to page_alloc.c I have also renamed the basic defines and structure from netdev_alloc_cache to page_frag_cache to reflect that this is now part of a different kernel subsystem. I have also added a simple __free_page_frag function which can handle freeing the frags based on the skb->head pointer. The model for this is based off of __free_pages since we don't actually need to deal with all of the cases that put_page handles. I incorporated the virt_to_head_page call and compound_order into the function as it actually allows for a signficant size reduction by reducing code duplication. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 100 ++++-------------------------------------------------- 1 file changed, 6 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a3062ec341c3..dcc0e07abf47 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -347,100 +347,12 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); -struct netdev_alloc_cache { - void * va; -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - __u16 offset; - __u16 size; -#else - __u32 offset; -#endif - /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. - */ - unsigned int pagecnt_bias; - bool pfmemalloc; -}; -static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); - -static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, - gfp_t gfp_mask) -{ - struct page *page = NULL; - gfp_t gfp = gfp_mask; - -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, - NETDEV_FRAG_PAGE_MAX_ORDER); - nc->size = page ? NETDEV_FRAG_PAGE_MAX_SIZE : PAGE_SIZE; -#endif - if (unlikely(!page)) - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - - nc->va = page ? page_address(page) : NULL; - - return page; -} - -static void *__alloc_page_frag(struct netdev_alloc_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) -{ - unsigned int size = PAGE_SIZE; - struct page *page; - int offset; - - if (unlikely(!nc->va)) { -refill: - page = __page_frag_refill(nc, gfp_mask); - if (!page) - return NULL; - -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif - /* Even if we own the page, we do not use atomic_set(). - * This would break get_page_unless_zero() users. - */ - atomic_add(size - 1, &page->_count); - - /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; - nc->pagecnt_bias = size; - nc->offset = size; - } - - offset = nc->offset - fragsz; - if (unlikely(offset < 0)) { - page = virt_to_page(nc->va); - - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) - goto refill; - -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif - /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); - - /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; - offset = size - fragsz; - } - - nc->pagecnt_bias--; - nc->offset = offset; - - return nc->va + offset; -} +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; + struct page_frag_cache *nc; unsigned long flags; void *data; @@ -466,7 +378,7 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); return __alloc_page_frag(nc, fragsz, gfp_mask); } @@ -493,7 +405,7 @@ EXPORT_SYMBOL(napi_alloc_frag); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; + struct page_frag_cache *nc; unsigned long flags; struct sk_buff *skb; bool pfmemalloc; @@ -556,7 +468,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; -- cgit From 181edb2bfa22b50817684135ab6430ed2808abf0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:12:03 -0700 Subject: net: Add skb_free_frag to replace use of put_page in freeing skb->head This change adds a function called skb_free_frag which is meant to compliment the function netdev_alloc_frag. The general idea is to enable a more lightweight version of page freeing since we don't actually need all the overhead of a put_page, and we don't quite fit the model of __free_pages. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dcc0e07abf47..d67e612bf0ef 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -436,7 +436,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, skb = __build_skb(data, len); if (unlikely(!skb)) { - put_page(virt_to_head_page(data)); + skb_free_frag(data); return NULL; } @@ -490,7 +490,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, skb = __build_skb(data, len); if (unlikely(!skb)) { - put_page(virt_to_head_page(data)); + skb_free_frag(data); return NULL; } @@ -549,10 +549,12 @@ static void skb_clone_fraglist(struct sk_buff *skb) static void skb_free_head(struct sk_buff *skb) { + unsigned char *head = skb->head; + if (skb->head_frag) - put_page(virt_to_head_page(skb->head)); + skb_free_frag(head); else - kfree(skb->head); + kfree(head); } static void skb_release_data(struct sk_buff *skb) -- cgit From a3eb95f891d6130b1fc03dd07a8b54cf0a5c8ab8 Mon Sep 17 00:00:00 2001 From: David Ward Date: Sat, 9 May 2015 22:01:46 -0400 Subject: net_sched: gred: add TCA_GRED_LIMIT attribute In a GRED qdisc, if the default "virtual queue" (VQ) does not have drop parameters configured, then packets for the default VQ are not subjected to RED and are only dropped if the queue is larger than the net_device's tx_queue_len. This behavior is useful for WRED mode, since these packets will still influence the calculated average queue length and (therefore) the drop probability for all of the other VQs. However, for some drivers tx_queue_len is zero. In other cases the user may wish to make the limit the same for all VQs (including the default VQ with no drop parameters). This change adds a TCA_GRED_LIMIT attribute to set the GRED queue limit, in bytes, during qdisc setup. (This limit is in bytes to be consistent with the drop parameters.) The default limit is the same as for a bfifo queue (tx_queue_len * psched_mtu). If the drop parameters of any VQ are configured with a smaller limit than the GRED queue limit, that VQ will still observe the smaller limit instead. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index a4ca4517cdc8..826e2994152b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -165,7 +165,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) * if no default DP has been configured. This * allows for DP flows to be left untouched. */ - if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + sch->limit)) return qdisc_enqueue_tail(skb, sch); else goto drop; @@ -397,7 +398,10 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, q->DP = dp; q->prio = prio; - q->limit = ctl->limit; + if (ctl->limit > sch->limit) + q->limit = sch->limit; + else + q->limit = ctl->limit; if (q->backlog == 0) red_end_of_idle_period(&q->vars); @@ -414,6 +418,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, + [TCA_GRED_LIMIT] = { .type = NLA_U32 }, }; static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -433,11 +438,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; - if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { + if (tb[TCA_GRED_LIMIT] != NULL) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); return gred_change_table_def(sch, opt); + } if (tb[TCA_GRED_PARMS] == NULL || - tb[TCA_GRED_STAB] == NULL) + tb[TCA_GRED_STAB] == NULL || + tb[TCA_GRED_LIMIT] != NULL) return -EINVAL; max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; @@ -501,6 +510,14 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; + if (tb[TCA_GRED_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); + else { + u32 qlen = qdisc_dev(sch)->tx_queue_len ? : 1; + + sch->limit = qlen * psched_mtu(qdisc_dev(sch)); + } + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } @@ -531,6 +548,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) + goto nla_put_failure; + parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; -- cgit From ebb9a03a590e2325f747be43c8db450e92509501 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 10 May 2015 09:47:46 -0700 Subject: switchdev: s/netdev_switch_/switchdev_/ and s/NETDEV_SWITCH_/SWITCHDEV_/ Turned out that "switchdev" sticks. So just unify all related terms to use this prefix. Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Acked-by: Roopa Prabhu Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- net/bridge/br.c | 22 ++++---- net/bridge/br_netlink.c | 6 +-- net/bridge/br_stp.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/rtnetlink.c | 2 +- net/ipv4/fib_trie.c | 40 +++++++------- net/switchdev/switchdev.c | 134 +++++++++++++++++++++++----------------------- 7 files changed, 101 insertions(+), 107 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 02c24cf63c34..a1abe4936fe1 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -121,13 +121,13 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; -static int br_netdev_switch_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static int br_switchdev_event(struct notifier_block *unused, + unsigned long event, void *ptr) { - struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; - struct netdev_switch_notifier_fdb_info *fdb_info; + struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; rtnl_lock(); @@ -138,14 +138,14 @@ static int br_netdev_switch_event(struct notifier_block *unused, br = p->br; switch (event) { - case NETDEV_SWITCH_FDB_ADD: + case SWITCHDEV_FDB_ADD: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid); if (err) err = notifier_from_errno(err); break; - case NETDEV_SWITCH_FDB_DEL: + case SWITCHDEV_FDB_DEL: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid); @@ -159,8 +159,8 @@ out: return err; } -static struct notifier_block br_netdev_switch_notifier = { - .notifier_call = br_netdev_switch_event, +static struct notifier_block br_switchdev_notifier = { + .notifier_call = br_switchdev_event, }; static void __net_exit br_net_exit(struct net *net) @@ -214,7 +214,7 @@ static int __init br_init(void) if (err) goto err_out3; - err = register_netdev_switch_notifier(&br_netdev_switch_notifier); + err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; @@ -235,7 +235,7 @@ static int __init br_init(void) return 0; err_out5: - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: @@ -253,7 +253,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4b5c236998ff..dc234533e204 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -631,8 +631,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (p && !(flags & BRIDGE_FLAGS_SELF)) { /* set bridge attributes in hardware if supported */ - ret_offload = netdev_switch_port_bridge_setlink(dev, nlh, - flags); + ret_offload = switchdev_port_bridge_setlink(dev, nlh, flags); if (ret_offload && ret_offload != -EOPNOTSUPP) br_warn(p->br, "error setting attrs on port %u(%s)\n", (unsigned int)p->port_no, p->dev->name); @@ -671,8 +670,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (p && !(flags & BRIDGE_FLAGS_SELF)) { /* del bridge attributes in hardware */ - ret_offload = netdev_switch_port_bridge_dellink(dev, nlh, - flags); + ret_offload = switchdev_port_bridge_dellink(dev, nlh, flags); if (ret_offload && ret_offload != -EOPNOTSUPP) br_warn(p->br, "error deleting attrs on port %u (%s)\n", (unsigned int)p->port_no, p->dev->name); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index fb3ebe615513..28e3f4bc01e0 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -42,7 +42,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) int err; p->state = state; - err = netdev_switch_port_stp_update(p->dev, state); + err = switchdev_port_stp_update(p->dev, state); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4238d6da5c60..be86a7ce9282 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -460,7 +460,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct netdev_phys_item_id ppid; - ret = netdev_switch_parent_id_get(netdev, &ppid); + ret = switchdev_parent_id_get(netdev, &ppid); if (!ret) ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 83e08323fdcd..fcd41fcd7e70 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1006,7 +1006,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) int err; struct netdev_phys_item_id psid; - err = netdev_switch_parent_id_get(dev, &psid); + err = switchdev_parent_id_get(dev, &psid); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e13fcc602da2..03444c6ae342 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1165,13 +1165,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; - err = netdev_switch_fib_ipv4_add(key, plen, fi, - new_fa->fa_tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); kmem_cache_free(fn_alias_kmem, new_fa); goto out; } @@ -1215,12 +1215,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->tb_id = tb->tb_id; /* (Optionally) offload fib entry to switch hardware. */ - err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, + cfg->fc_nlflags, tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); goto out_free_new_fa; } @@ -1239,7 +1237,7 @@ succeeded: return 0; out_sw_fib_del: - netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1517,8 +1515,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, - cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1767,10 +1765,9 @@ void fib_table_flush_external(struct fib_table *tb) if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) continue; - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); } /* update leaf slen */ @@ -1835,10 +1832,9 @@ int fib_table_flush(struct fib_table *tb) continue; } - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 46568b85c333..52613ed49a8c 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -19,14 +19,14 @@ #include /** - * netdev_switch_parent_id_get - Get ID of a switch + * switchdev_parent_id_get - Get ID of a switch * @dev: port device * @psid: switch ID * * Get ID of a switch this port is part of. */ -int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +int switchdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid) { const struct swdev_ops *ops = dev->swdev_ops; @@ -34,17 +34,17 @@ int netdev_switch_parent_id_get(struct net_device *dev, return -EOPNOTSUPP; return ops->swdev_parent_id_get(dev, psid); } -EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get); +EXPORT_SYMBOL_GPL(switchdev_parent_id_get); /** - * netdev_switch_port_stp_update - Notify switch device port of STP + * switchdev_port_stp_update - Notify switch device port of STP * state change * @dev: port device * @state: port STP state * * Notify switch device port of bridge port STP state change. */ -int netdev_switch_port_stp_update(struct net_device *dev, u8 state) +int switchdev_port_stp_update(struct net_device *dev, u8 state) { const struct swdev_ops *ops = dev->swdev_ops; struct net_device *lower_dev; @@ -55,57 +55,57 @@ int netdev_switch_port_stp_update(struct net_device *dev, u8 state) return ops->swdev_port_stp_update(dev, state); netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_stp_update(lower_dev, state); + err = switchdev_port_stp_update(lower_dev, state); if (err && err != -EOPNOTSUPP) return err; } return err; } -EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update); +EXPORT_SYMBOL_GPL(switchdev_port_stp_update); -static DEFINE_MUTEX(netdev_switch_mutex); -static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain); +static DEFINE_MUTEX(switchdev_mutex); +static RAW_NOTIFIER_HEAD(switchdev_notif_chain); /** - * register_netdev_switch_notifier - Register notifier + * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. This should be used by code * which needs to monitor events happening in particular device. * Return values are same as for atomic_notifier_chain_register(). */ -int register_netdev_switch_notifier(struct notifier_block *nb) +int register_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_register(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(register_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** - * unregister_netdev_switch_notifier - Unregister notifier + * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. * Return values are same as for atomic_notifier_chain_unregister(). */ -int unregister_netdev_switch_notifier(struct notifier_block *nb) +int unregister_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** - * call_netdev_switch_notifiers - Call notifiers + * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data @@ -114,21 +114,21 @@ EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); * when it needs to propagate hardware event. * Return values are same as for atomic_notifier_call_chain(). */ -int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info) +int call_switchdev_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info) { int err; info->dev = dev; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers); +EXPORT_SYMBOL_GPL(call_switchdev_notifiers); /** - * netdev_switch_port_bridge_setlink - Notify switch device port of bridge + * switchdev_port_bridge_setlink - Notify switch device port of bridge * port attributes * * @dev: port device @@ -137,8 +137,8 @@ EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers); * * Notify switch device port of bridge port attributes */ -int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -150,10 +150,10 @@ int netdev_switch_port_bridge_setlink(struct net_device *dev, return ops->ndo_bridge_setlink(dev, nlh, flags); } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); /** - * netdev_switch_port_bridge_dellink - Notify switch device port of bridge + * switchdev_port_bridge_dellink - Notify switch device port of bridge * port attribute delete * * @dev: port device @@ -162,8 +162,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink); * * Notify switch device port of bridge port attribute delete */ -int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -175,11 +175,11 @@ int netdev_switch_port_bridge_dellink(struct net_device *dev, return ops->ndo_bridge_dellink(dev, nlh, flags); } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); /** - * ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink - * op for master devices + * ndo_dflt_switchdev_port_bridge_setlink - default ndo bridge setlink + * op for master devices * * @dev: port device * @nlh: netlink msg with bridge port attributes @@ -187,8 +187,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink); * * Notify master device slaves of bridge port attributes */ -int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { struct net_device *lower_dev; struct list_head *iter; @@ -198,18 +198,18 @@ int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, return ret; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags); + err = switchdev_port_bridge_setlink(lower_dev, nlh, flags); if (err && err != -EOPNOTSUPP) ret = err; } return ret; } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink); +EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_setlink); /** - * ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink - * op for master devices + * ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink + * op for master devices * * @dev: port device * @nlh: netlink msg with bridge port attributes @@ -217,8 +217,8 @@ EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink); * * Notify master device slaves of bridge port attribute deletes */ -int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { struct net_device *lower_dev; struct list_head *iter; @@ -228,16 +228,16 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, return ret; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags); + err = switchdev_port_bridge_dellink(lower_dev, nlh, flags); if (err && err != -EOPNOTSUPP) ret = err; } return ret; } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink); -static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) +static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) { const struct swdev_ops *ops = dev->swdev_ops; struct net_device *lower_dev; @@ -253,7 +253,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { - port_dev = netdev_switch_get_lowest_dev(lower_dev); + port_dev = switchdev_get_lowest_dev(lower_dev); if (port_dev) return port_dev; } @@ -261,7 +261,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) return NULL; } -static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) +static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) { struct netdev_phys_item_id psid; struct netdev_phys_item_id prev_psid; @@ -276,11 +276,11 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) if (!nh->nh_dev) return NULL; - dev = netdev_switch_get_lowest_dev(nh->nh_dev); + dev = switchdev_get_lowest_dev(nh->nh_dev); if (!dev) return NULL; - if (netdev_switch_parent_id_get(dev, &psid)) + if (switchdev_parent_id_get(dev, &psid)) return NULL; if (nhsel > 0) { @@ -297,7 +297,7 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) } /** - * netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch + * switchdev_fib_ipv4_add - Add IPv4 route entry to switch * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -309,8 +309,8 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) * * Add IPv4 route entry to switch device. */ -int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id) +int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, u32 tb_id) { struct net_device *dev; const struct swdev_ops *ops; @@ -328,7 +328,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (fi->fib_net->ipv4.fib_offload_disabled) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; ops = dev->swdev_ops; @@ -343,10 +343,10 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, return err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); /** - * netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch + * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -357,8 +357,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); * * Delete IPv4 route entry from switch device. */ -int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) +int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) { struct net_device *dev; const struct swdev_ops *ops; @@ -367,7 +367,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, if (!(fi->fib_flags & RTNH_F_EXTERNAL)) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; ops = dev->swdev_ops; @@ -381,14 +381,14 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, return err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_del); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); /** - * netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation + * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation * * @fi: route FIB info structure */ -void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +void switchdev_fib_ipv4_abort(struct fib_info *fi) { /* There was a problem installing this route to the offload * device. For now, until we come up with more refined @@ -401,4 +401,4 @@ void netdev_switch_fib_ipv4_abort(struct fib_info *fi) fib_flush_external(fi->fib_net); fi->fib_net->ipv4.fib_offload_disabled = true; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_abort); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); -- cgit From 9d47c0a2d958e06322c88245749278633d333cca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 10 May 2015 09:47:47 -0700 Subject: switchdev: s/swdev_/switchdev_/ Turned out that "switchdev" sticks. So just unify all related terms to use this prefix. Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Acked-by: Roopa Prabhu Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- net/dsa/slave.c | 8 ++++---- net/switchdev/switchdev.c | 40 ++++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 03e041addea3..1546acf6ebd3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -675,9 +675,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_get_iflink = dsa_slave_get_iflink, }; -static const struct swdev_ops dsa_slave_swdev_ops = { - .swdev_parent_id_get = dsa_slave_parent_id_get, - .swdev_port_stp_update = dsa_slave_stp_update, +static const struct switchdev_ops dsa_slave_switchdev_ops = { + .switchdev_parent_id_get = dsa_slave_parent_id_get, + .switchdev_port_stp_update = dsa_slave_stp_update, }; static void dsa_slave_adjust_link(struct net_device *dev) @@ -866,7 +866,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; slave_dev->netdev_ops = &dsa_slave_netdev_ops; - slave_dev->swdev_ops = &dsa_slave_swdev_ops; + slave_dev->switchdev_ops = &dsa_slave_switchdev_ops; netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, NULL); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 52613ed49a8c..b7f44a23def5 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -28,11 +28,11 @@ int switchdev_parent_id_get(struct net_device *dev, struct netdev_phys_item_id *psid) { - const struct swdev_ops *ops = dev->swdev_ops; + const struct switchdev_ops *ops = dev->switchdev_ops; - if (!ops || !ops->swdev_parent_id_get) + if (!ops || !ops->switchdev_parent_id_get) return -EOPNOTSUPP; - return ops->swdev_parent_id_get(dev, psid); + return ops->switchdev_parent_id_get(dev, psid); } EXPORT_SYMBOL_GPL(switchdev_parent_id_get); @@ -46,13 +46,13 @@ EXPORT_SYMBOL_GPL(switchdev_parent_id_get); */ int switchdev_port_stp_update(struct net_device *dev, u8 state) { - const struct swdev_ops *ops = dev->swdev_ops; + const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; - if (ops && ops->swdev_port_stp_update) - return ops->swdev_port_stp_update(dev, state); + if (ops && ops->switchdev_port_stp_update) + return ops->switchdev_port_stp_update(dev, state); netdev_for_each_lower_dev(dev, lower_dev, iter) { err = switchdev_port_stp_update(lower_dev, state); @@ -239,17 +239,17 @@ EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink); static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) { - const struct swdev_ops *ops = dev->swdev_ops; + const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; struct net_device *port_dev; struct list_head *iter; /* Recusively search down until we find a sw port dev. - * (A sw port dev supports swdev_parent_id_get). + * (A sw port dev supports switchdev_parent_id_get). */ if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD && - ops && ops->swdev_parent_id_get) + ops && ops->switchdev_parent_id_get) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { @@ -313,7 +313,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 nlflags, u32 tb_id) { struct net_device *dev; - const struct swdev_ops *ops; + const struct switchdev_ops *ops; int err = 0; /* Don't offload route if using custom ip rules or if @@ -331,12 +331,12 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->swdev_ops; + ops = dev->switchdev_ops; - if (ops->swdev_fib_ipv4_add) { - err = ops->swdev_fib_ipv4_add(dev, htonl(dst), dst_len, - fi, tos, type, nlflags, - tb_id); + if (ops->switchdev_fib_ipv4_add) { + err = ops->switchdev_fib_ipv4_add(dev, htonl(dst), dst_len, + fi, tos, type, nlflags, + tb_id); if (!err) fi->fib_flags |= RTNH_F_EXTERNAL; } @@ -361,7 +361,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 tb_id) { struct net_device *dev; - const struct swdev_ops *ops; + const struct switchdev_ops *ops; int err = 0; if (!(fi->fib_flags & RTNH_F_EXTERNAL)) @@ -370,11 +370,11 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->swdev_ops; + ops = dev->switchdev_ops; - if (ops->swdev_fib_ipv4_del) { - err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len, - fi, tos, type, tb_id); + if (ops->switchdev_fib_ipv4_del) { + err = ops->switchdev_fib_ipv4_del(dev, htonl(dst), dst_len, + fi, tos, type, tb_id); if (!err) fi->fib_flags &= ~RTNH_F_EXTERNAL; } -- cgit From 3094333d9089d43e8b8f0418676fa6ae06c27b51 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:48 -0700 Subject: switchdev: introduce get/set attrs ops Add two new swdev ops for get/set switch port attributes. Most swdev interactions on a port are gets or sets on port attributes, so rather than adding ops for each attribute, let's define clean get/set ops for all attributes, and then we can have clear, consistent rules on how attributes propagate on stacked devs. Add the basic algorithms for get/set attr ops. Use the same recusive algo to walk lower devs we've used for STP updates, for example. For get, compare attr value for each lower dev and only return success if attr values match across all lower devs. For sets, set the same attr value for all lower devs. We'll use a two-phase prepare-commit transaction model for sets. In the first phase, the driver(s) are asked if attr set is OK. If all OK, the commit attr set in second phase. A driver would NACK the prepare phase if it can't set the attr due to lack of resources or support, within it's control. RTNL lock must be held across both phases because we'll recurse all lower devs first in prepare phase, and then recurse all lower devs again in commit phase. If any lower dev fails the prepare phase, we need to abort the transaction for all lower devs. If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to indicate get/set only work on port (lowest) device. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index b7f44a23def5..8f47187dc185 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -36,6 +36,175 @@ int switchdev_parent_id_get(struct net_device *dev, } EXPORT_SYMBOL_GPL(switchdev_parent_id_get); +/** + * switchdev_port_attr_get - Get port attribute + * + * @dev: port device + * @attr: attribute to get + */ +int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + struct switchdev_attr first = { + .id = SWITCHDEV_ATTR_UNDEFINED + }; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_attr_get) + return ops->switchdev_port_attr_get(dev, attr); + + if (attr->flags & SWITCHDEV_F_NO_RECURSE) + return err; + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to get attr on + * each port. Return -ENODATA if attr values don't + * compare across ports. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_attr_get(lower_dev, attr); + if (err) + break; + if (first.id == SWITCHDEV_ATTR_UNDEFINED) + first = *attr; + else if (memcmp(&first, attr, sizeof(*attr))) + return -ENODATA; + } + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_attr_get); + +static int __switchdev_port_attr_set(struct net_device *dev, + struct switchdev_attr *attr) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_attr_set) + return ops->switchdev_port_attr_set(dev, attr); + + if (attr->flags & SWITCHDEV_F_NO_RECURSE) + return err; + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to set attr on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_port_attr_set(lower_dev, attr); + if (err) + break; + } + + return err; +} + +struct switchdev_attr_set_work { + struct work_struct work; + struct net_device *dev; + struct switchdev_attr attr; +}; + +static void switchdev_port_attr_set_work(struct work_struct *work) +{ + struct switchdev_attr_set_work *asw = + container_of(work, struct switchdev_attr_set_work, work); + int err; + + rtnl_lock(); + err = switchdev_port_attr_set(asw->dev, &asw->attr); + BUG_ON(err); + rtnl_unlock(); + + dev_put(asw->dev); + kfree(work); +} + +static int switchdev_port_attr_set_defer(struct net_device *dev, + struct switchdev_attr *attr) +{ + struct switchdev_attr_set_work *asw; + + asw = kmalloc(sizeof(*asw), GFP_ATOMIC); + if (!asw) + return -ENOMEM; + + INIT_WORK(&asw->work, switchdev_port_attr_set_work); + + dev_hold(dev); + asw->dev = dev; + memcpy(&asw->attr, attr, sizeof(asw->attr)); + + schedule_work(&asw->work); + + return 0; +} + +/** + * switchdev_port_attr_set - Set port attribute + * + * @dev: port device + * @attr: attribute to set + * + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. + */ +int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) +{ + int err; + + if (!rtnl_is_locked()) { + /* Running prepare-commit transaction across stacked + * devices requires nothing moves, so if rtnl_lock is + * not held, schedule a worker thread to hold rtnl_lock + * while setting attr. + */ + + return switchdev_port_attr_set_defer(dev, attr); + } + + /* Phase I: prepare for attr set. Driver/device should fail + * here if there are going to be issues in the commit phase, + * such as lack of resources or support. The driver/device + * should reserve resources needed for the commit phase here, + * but should not commit the attr. + */ + + attr->trans = SWITCHDEV_TRANS_PREPARE; + err = __switchdev_port_attr_set(dev, attr); + if (err) { + /* Prepare phase failed: abort the transaction. Any + * resources reserved in the prepare phase are + * released. + */ + + attr->trans = SWITCHDEV_TRANS_ABORT; + __switchdev_port_attr_set(dev, attr); + + return err; + } + + /* Phase II: commit attr set. This cannot fail as a fault + * of driver/device. If it does, it's a bug in the driver/device + * because the driver said everythings was OK in phase I. + */ + + attr->trans = SWITCHDEV_TRANS_COMMIT; + err = __switchdev_port_attr_set(dev, attr); + BUG_ON(err); + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_attr_set); + /** * switchdev_port_stp_update - Notify switch device port of STP * state change -- cgit From f8e20a9f87d33865cc1d67f13da0db8d457fc3c9 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:49 -0700 Subject: switchdev: convert parent_id_get to switchdev attr get Switch ID is just a gettable port attribute. Convert switchdev op switchdev_parent_id_get to a switchdev attr. Note: for sysfs and netlink interfaces, SWITCHDEV_ATTR_PORT_PARENT_ID is called with SWITCHDEV_F_NO_RECUSE to limit switch ID user-visiblity to only port netdevs. So when a port is stacked under bond/bridge, the user can only query switch id via the switch ports, but not via the upper devices Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 10 +++++++--- net/core/rtnetlink.c | 9 ++++++--- net/dsa/slave.c | 16 +++++++++++----- net/switchdev/switchdev.c | 38 +++++++++++--------------------------- 4 files changed, 35 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index be86a7ce9282..5a9ce96f6d27 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -458,11 +458,15 @@ static ssize_t phys_switch_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - ret = switchdev_parent_id_get(netdev, &ppid); + ret = switchdev_port_attr_get(netdev, &attr); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sprintf(buf, "%*phN\n", attr.ppid.id_len, + attr.ppid.id); } rtnl_unlock(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fcd41fcd7e70..c6c6b2c34926 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1004,16 +1004,19 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct netdev_phys_item_id psid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - err = switchdev_parent_id_get(dev, &psid); + err = switchdev_port_attr_get(dev, &attr); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } - if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.ppid.id_len, attr.ppid.id)) return -EMSGSIZE; return 0; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1546acf6ebd3..de705b674ac9 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -382,14 +382,20 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev) return ret; } -static int dsa_slave_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +static int dsa_slave_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - psid->id_len = sizeof(ds->index); - memcpy(&psid->id, &ds->index, psid->id_len); + switch (attr->id) { + case SWITCHDEV_ATTR_PORT_PARENT_ID: + attr->ppid.id_len = sizeof(ds->index); + memcpy(&attr->ppid.id, &ds->index, attr->ppid.id_len); + break; + default: + return -EOPNOTSUPP; + } return 0; } @@ -676,7 +682,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { }; static const struct switchdev_ops dsa_slave_switchdev_ops = { - .switchdev_parent_id_get = dsa_slave_parent_id_get, + .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_stp_update = dsa_slave_stp_update, }; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8f47187dc185..117fd0797abd 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -18,24 +18,6 @@ #include #include -/** - * switchdev_parent_id_get - Get ID of a switch - * @dev: port device - * @psid: switch ID - * - * Get ID of a switch this port is part of. - */ -int switchdev_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) -{ - const struct switchdev_ops *ops = dev->switchdev_ops; - - if (!ops || !ops->switchdev_parent_id_get) - return -EOPNOTSUPP; - return ops->switchdev_parent_id_get(dev, psid); -} -EXPORT_SYMBOL_GPL(switchdev_parent_id_get); - /** * switchdev_port_attr_get - Get port attribute * @@ -414,11 +396,10 @@ static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) struct list_head *iter; /* Recusively search down until we find a sw port dev. - * (A sw port dev supports switchdev_parent_id_get). + * (A sw port dev supports switchdev_port_attr_get). */ - if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD && - ops && ops->switchdev_parent_id_get) + if (ops && ops->switchdev_port_attr_get) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { @@ -432,8 +413,10 @@ static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) { - struct netdev_phys_item_id psid; - struct netdev_phys_item_id prev_psid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + }; + struct switchdev_attr prev_attr; struct net_device *dev = NULL; int nhsel; @@ -449,17 +432,18 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) if (!dev) return NULL; - if (switchdev_parent_id_get(dev, &psid)) + if (switchdev_port_attr_get(dev, &attr)) return NULL; if (nhsel > 0) { - if (prev_psid.id_len != psid.id_len) + if (prev_attr.ppid.id_len != attr.ppid.id_len) return NULL; - if (memcmp(prev_psid.id, psid.id, psid.id_len)) + if (memcmp(prev_attr.ppid.id, attr.ppid.id, + attr.ppid.id_len)) return NULL; } - prev_psid = psid; + prev_attr = attr; } return dev; -- cgit From 3563606258cf3b8f02eabddb1cb45a94c44d9611 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:51 -0700 Subject: switchdev: convert STP update to switchdev attr set STP update is just a settable port attribute, so convert switchdev_port_stp_update to an attr set. For DSA, the prepare phase is skipped and STP updates are only done in the commit phase. This is because currently the DSA drivers don't need to allocate any memory for STP updates and the STP update will not fail to HW (unless something horrible goes wrong on the MDIO bus, in which case the prepare phase wouldn't have been able to predict anyway). Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 6 +++++- net/dsa/slave.c | 20 +++++++++++++++++++- net/switchdev/switchdev.c | 28 ---------------------------- 3 files changed, 24 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 28e3f4bc01e0..b9300da31565 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -39,10 +39,14 @@ void br_log_state(const struct net_bridge_port *p) void br_set_state(struct net_bridge_port *p, unsigned int state) { + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_STP_STATE, + .stp_state = state, + }; int err; p->state = state; - err = switchdev_port_stp_update(p->dev, state); + err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index de705b674ac9..3fb5210e318c 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -345,6 +345,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) return ret; } +static int dsa_slave_port_attr_set(struct net_device *dev, + struct switchdev_attr *attr) +{ + int ret = 0; + + switch (attr->id) { + case SWITCHDEV_ATTR_PORT_STP_STATE: + if (attr->trans == SWITCHDEV_TRANS_COMMIT) + ret = dsa_slave_stp_update(dev, attr->stp_state); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + return ret; +} + static int dsa_slave_bridge_port_join(struct net_device *dev, struct net_device *br) { @@ -683,7 +701,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_get = dsa_slave_port_attr_get, - .switchdev_port_stp_update = dsa_slave_stp_update, + .switchdev_port_attr_set = dsa_slave_port_attr_set, }; static void dsa_slave_adjust_link(struct net_device *dev) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 117fd0797abd..a3c359004902 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -187,34 +187,6 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); -/** - * switchdev_port_stp_update - Notify switch device port of STP - * state change - * @dev: port device - * @state: port STP state - * - * Notify switch device port of bridge port STP state change. - */ -int switchdev_port_stp_update(struct net_device *dev, u8 state) -{ - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - if (ops && ops->switchdev_port_stp_update) - return ops->switchdev_port_stp_update(dev, state); - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_stp_update(lower_dev, state); - if (err && err != -EOPNOTSUPP) - return err; - } - - return err; -} -EXPORT_SYMBOL_GPL(switchdev_port_stp_update); - static DEFINE_MUTEX(switchdev_mutex); static RAW_NOTIFIER_HEAD(switchdev_notif_chain); -- cgit From 491d0f1533ac750260406dbf84cdad44fd3d8a29 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:52 -0700 Subject: switchdev: introduce switchdev add/del obj ops Like switchdev attr get/set, add new switchdev obj add/del. switchdev objs will be things like VLANs or FIB entries, so add/del fits better for objects than get/set used for attributes. Use same two-phase prepare-commit transaction model as in attr set. Signed-off-by: Scott Feldman Acked-by: Sridhar Samudrala Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index a3c359004902..3d4d99a70b80 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -187,6 +187,113 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); +int __switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_obj_add) + return ops->switchdev_port_obj_add(dev, obj); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to add object on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_port_obj_add(lower_dev, obj); + if (err) + break; + } + + return err; +} + +/** + * switchdev_port_obj_add - Add port object + * + * @dev: port device + * @obj: object to add + * + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. + * + * rtnl_lock must be held. + */ +int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj) +{ + int err; + + ASSERT_RTNL(); + + /* Phase I: prepare for obj add. Driver/device should fail + * here if there are going to be issues in the commit phase, + * such as lack of resources or support. The driver/device + * should reserve resources needed for the commit phase here, + * but should not commit the obj. + */ + + obj->trans = SWITCHDEV_TRANS_PREPARE; + err = __switchdev_port_obj_add(dev, obj); + if (err) { + /* Prepare phase failed: abort the transaction. Any + * resources reserved in the prepare phase are + * released. + */ + + obj->trans = SWITCHDEV_TRANS_ABORT; + __switchdev_port_obj_add(dev, obj); + + return err; + } + + /* Phase II: commit obj add. This cannot fail as a fault + * of driver/device. If it does, it's a bug in the driver/device + * because the driver said everythings was OK in phase I. + */ + + obj->trans = SWITCHDEV_TRANS_COMMIT; + err = __switchdev_port_obj_add(dev, obj); + WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_add); + +/** + * switchdev_port_obj_del - Delete port object + * + * @dev: port device + * @obj: object to delete + */ +int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_obj_del) + return ops->switchdev_port_obj_del(dev, obj); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to delete object on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_obj_del(lower_dev, obj); + if (err) + break; + } + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_del); + static DEFINE_MUTEX(switchdev_mutex); static RAW_NOTIFIER_HEAD(switchdev_notif_chain); -- cgit From 47f8328bb1a4115413e35b9b20d04b061ed544f8 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:56 -0700 Subject: switchdev: add new switchdev bridge setlink Add new switchdev_port_bridge_setlink that can be used by drivers implementing .ndo_bridge_setlink to set switchdev bridge attributes. Basically turn the raw rtnl_bridge_setlink netlink into switchdev attr sets. Proper netlink attr policy checking is done on the protinfo part of the netlink msg. Currently, for protinfo, only bridge port attrs BR_LEARNING and BR_LEARNING_SYNC are parsed and passed to port driver. For afspec, VLAN objs are passed so switchdev driver can set VLANs assigned to SELF. To illustrate with iproute2 cmd, we have: bridge vlan add vid 10 dev sw1p1 self master To add VLAN 10 to port sw1p1 for both the bridge (master) and the device (self). Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 151 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 140 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 3d4d99a70b80..b01791a9b56d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -357,28 +358,156 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); +static int switchdev_port_br_setflag(struct net_device *dev, + struct nlattr *nlattr, + unsigned long brport_flag) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS, + }; + u8 flag = nla_get_u8(nlattr); + int err; + + err = switchdev_port_attr_get(dev, &attr); + if (err) + return err; + + if (flag) + attr.brport_flags |= brport_flag; + else + attr.brport_flags &= ~brport_flag; + + return switchdev_port_attr_set(dev, &attr); +} + +static const struct nla_policy +switchdev_port_bridge_policy[IFLA_BRPORT_MAX + 1] = { + [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, + [IFLA_BRPORT_COST] = { .type = NLA_U32 }, + [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, + [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, + [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, + [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, + [IFLA_BRPORT_FAST_LEAVE] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING_SYNC] = { .type = NLA_U8 }, + [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, +}; + +static int switchdev_port_br_setlink_protinfo(struct net_device *dev, + struct nlattr *protinfo) +{ + struct nlattr *attr; + int rem; + int err; + + err = nla_validate_nested(protinfo, IFLA_BRPORT_MAX, + switchdev_port_bridge_policy); + if (err) + return err; + + nla_for_each_nested(attr, protinfo, rem) { + switch (nla_type(attr)) { + case IFLA_BRPORT_LEARNING: + err = switchdev_port_br_setflag(dev, attr, + BR_LEARNING); + break; + case IFLA_BRPORT_LEARNING_SYNC: + err = switchdev_port_br_setflag(dev, attr, + BR_LEARNING_SYNC); + break; + default: + err = -EOPNOTSUPP; + break; + } + if (err) + return err; + } + + return 0; +} + +static int switchdev_port_br_afspec(struct net_device *dev, + struct nlattr *afspec, + int (*f)(struct net_device *dev, + struct switchdev_obj *obj)) +{ + struct nlattr *attr; + struct bridge_vlan_info *vinfo; + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + }; + int rem; + int err; + + nla_for_each_nested(attr, afspec, rem) { + if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO) + continue; + if (nla_len(attr) != sizeof(struct bridge_vlan_info)) + return -EINVAL; + vinfo = nla_data(attr); + obj.vlan.flags = vinfo->flags; + if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + if (obj.vlan.vid_start) + return -EINVAL; + obj.vlan.vid_start = vinfo->vid; + } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { + if (!obj.vlan.vid_start) + return -EINVAL; + obj.vlan.vid_end = vinfo->vid; + if (obj.vlan.vid_end <= obj.vlan.vid_start) + return -EINVAL; + err = f(dev, &obj); + if (err) + return err; + memset(&obj.vlan, 0, sizeof(obj.vlan)); + } else { + if (obj.vlan.vid_start) + return -EINVAL; + obj.vlan.vid_start = vinfo->vid; + obj.vlan.vid_end = vinfo->vid; + err = f(dev, &obj); + if (err) + return err; + memset(&obj.vlan, 0, sizeof(obj.vlan)); + } + } + + return 0; +} + /** - * switchdev_port_bridge_setlink - Notify switch device port of bridge - * port attributes + * switchdev_port_bridge_setlink - Set bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags + * @nlh: netlink header + * @flags: netlink flags * - * Notify switch device port of bridge port attributes + * Called for SELF on rtnl_bridge_setlink to set bridge port + * attributes. */ int switchdev_port_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { - const struct net_device_ops *ops = dev->netdev_ops; + struct nlattr *protinfo; + struct nlattr *afspec; + int err = 0; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return 0; + protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_PROTINFO); + if (protinfo) { + err = switchdev_port_br_setlink_protinfo(dev, protinfo); + if (err) + return err; + } - if (!ops->ndo_bridge_setlink) - return -EOPNOTSUPP; + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_AF_SPEC); + if (afspec) + err = switchdev_port_br_afspec(dev, afspec, + switchdev_port_obj_add); - return ops->ndo_bridge_setlink(dev, nlh, flags); + return err; } EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); -- cgit From e71f220b342d78cfb8ee9f1b60f1351f7183f2a5 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:58 -0700 Subject: switchdev: remove old switchdev_port_bridge_setlink New attr-based bridge_setlink can recurse lower devs and recover on err, so remove old wrapper (including ndo_dflt_switchdev_port_bridge_setlink). Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index b01791a9b56d..dcdec9de9137 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -536,36 +536,6 @@ int switchdev_port_bridge_dellink(struct net_device *dev, } EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); -/** - * ndo_dflt_switchdev_port_bridge_setlink - default ndo bridge setlink - * op for master devices - * - * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags - * - * Notify master device slaves of bridge port attributes - */ -int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) -{ - struct net_device *lower_dev; - struct list_head *iter; - int ret = 0, err = 0; - - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return ret; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_bridge_setlink(lower_dev, nlh, flags); - if (err && err != -EOPNOTSUPP) - ret = err; - } - - return ret; -} -EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_setlink); - /** * ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink * op for master devices -- cgit From 41c498b9359e360f08723b7605ec0c40926ec415 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:59 -0700 Subject: bridge: restore br_setlink back to original This is revert of: commit 68e331c785b8 ("bridge: offload bridge port attributes to switch asic if feature flag set") Restore br_setlink back to original and don't call into SELF port driver. rtnetlink.c:bridge_setlink() already does a call into port driver for SELF. bridge set link cmd defaults to MASTER. From man page for bridge link set cmd: self link setting is configured on specified physical device master link setting is configured on the software bridge (default) The link setting has two values: the device-side value and the software bridge-side value. These are independent and settable using the bridge link set cmd by specifying some combination of [master] | [self]. Furthermore, the device-side and bridge-side settings have their own initial value, viewable from bridge -d link show cmd. Restoring br_setlink back to original makes rocker (the only in-kernel user of SELF link settings) work as first implement: two-sided values. It's true that when both MASTER and SELF are specified from the command, two netlink notifications are generated, one for each side of the settings. The user-space app can distiquish between the two notifications by observing the MASTER or SELF flag. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dc234533e204..e9b943574542 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -586,7 +586,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) struct nlattr *afspec; struct net_bridge_port *p; struct nlattr *tb[IFLA_BRPORT_MAX + 1]; - int err = 0, ret_offload = 0; + int err = 0; protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); @@ -628,15 +628,6 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) afspec, RTM_SETLINK); } - if (p && !(flags & BRIDGE_FLAGS_SELF)) { - /* set bridge attributes in hardware if supported - */ - ret_offload = switchdev_port_bridge_setlink(dev, nlh, flags); - if (ret_offload && ret_offload != -EOPNOTSUPP) - br_warn(p->br, "error setting attrs on port %u(%s)\n", - (unsigned int)p->port_no, p->dev->name); - } - if (err == 0) br_ifinfo_notify(RTM_NEWLINK, p); out: -- cgit From 5c34e0221423aeabc0b085adc5fccda3f91e2c49 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:00 -0700 Subject: switchdev: add new switchdev_port_bridge_dellink Same change as setlink. Provide the wrapper op for SELF ndo_bridge_dellink and call into the switchdev driver to delete afspec VLANs. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index dcdec9de9137..8ce678e397b4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -512,27 +512,27 @@ int switchdev_port_bridge_setlink(struct net_device *dev, EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); /** - * switchdev_port_bridge_dellink - Notify switch device port of bridge - * port attribute delete + * switchdev_port_bridge_dellink - Set bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags + * @nlh: netlink header + * @flags: netlink flags * - * Notify switch device port of bridge port attribute delete + * Called for SELF on rtnl_bridge_dellink to set bridge port + * attributes. */ int switchdev_port_bridge_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return 0; + struct nlattr *afspec; - if (!ops->ndo_bridge_dellink) - return -EOPNOTSUPP; + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_AF_SPEC); + if (afspec) + return switchdev_port_br_afspec(dev, afspec, + switchdev_port_obj_del); - return ops->ndo_bridge_dellink(dev, nlh, flags); + return 0; } EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); -- cgit From 87a5dae59e7abaad911ab719caa5548dd6df5557 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:02 -0700 Subject: switchdev: remove unused switchdev_port_bridge_dellink Now we can remove old wrappers for dellink. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8ce678e397b4..0e15b6f6bb56 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -536,36 +536,6 @@ int switchdev_port_bridge_dellink(struct net_device *dev, } EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); -/** - * ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink - * op for master devices - * - * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge dellink flags - * - * Notify master device slaves of bridge port attribute deletes - */ -int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) -{ - struct net_device *lower_dev; - struct list_head *iter; - int ret = 0, err = 0; - - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return ret; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_bridge_dellink(lower_dev, nlh, flags); - if (err && err != -EOPNOTSUPP) - ret = err; - } - - return ret; -} -EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink); - static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) { const struct switchdev_ops *ops = dev->switchdev_ops; -- cgit From 8508025c598bdee33d9afa153e9c00c7771e7d63 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:03 -0700 Subject: bridge: revert br_dellink change back to original This is revert of: commit 68e331c785b8 ("bridge: offload bridge port attributes to switch asic if feature flag set") Restore br_dellink back to original and don't call into SELF port driver. rtnetlink.c:bridge_dellink() already does a call into port driver for SELF. bridge vlan add/del cmd defaults to MASTER. From man page for bridge vlan add/del cmd: self the vlan is configured on the specified physical device. Required if the device is the bridge device. master the vlan is configured on the software bridge (default). Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e9b943574542..6b67ed3831de 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -639,7 +639,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *afspec; struct net_bridge_port *p; - int err = 0, ret_offload = 0; + int err = 0; afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!afspec) @@ -658,15 +658,6 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) */ br_ifinfo_notify(RTM_NEWLINK, p); - if (p && !(flags & BRIDGE_FLAGS_SELF)) { - /* del bridge attributes in hardware - */ - ret_offload = switchdev_port_bridge_dellink(dev, nlh, flags); - if (ret_offload && ret_offload != -EOPNOTSUPP) - br_warn(p->br, "error deleting attrs on port %u (%s)\n", - (unsigned int)p->port_no, p->dev->name); - } - return err; } static int br_validate(struct nlattr *tb[], struct nlattr *data[]) -- cgit From 8793d0a664a8a2c5e18e929c1f995c784c105705 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:04 -0700 Subject: switchdev: add new switchdev_port_bridge_getlink Like bridge_setlink, add switchdev wrapper to handle bridge_getlink and call into port driver to get port attrs. For now, only BR_LEARNING and BR_LEARNING_SYNC are returned. To add more, we'll probably want to break away from ndo_dflt_bridge_getlink() and build the netlink skb directly in the switchdev code. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 0e15b6f6bb56..9210355ec965 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -358,6 +358,34 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); +/** + * switchdev_port_bridge_getlink - Get bridge port attributes + * + * @dev: port device + * + * Called for SELF on rtnl_bridge_getlink to get bridge port + * attributes. + */ +int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, + int nlflags) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS, + }; + u16 mode = BRIDGE_MODE_UNDEF; + u32 mask = BR_LEARNING | BR_LEARNING_SYNC; + int err; + + err = switchdev_port_attr_get(dev, &attr); + if (err) + return err; + + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, + attr.brport_flags, mask, nlflags); +} +EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); + static int switchdev_port_br_setflag(struct net_device *dev, struct nlattr *nlattr, unsigned long brport_flag) -- cgit From 58c2cb16b116d7feace621bd6b647bbabacfa225 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:06 -0700 Subject: switchdev: convert fib_ipv4_add/del over to switchdev_port_obj_add/del The IPv4 FIB ops convert nicely to the switchdev objs and we're left with only four switchdev ops: port get/set and port add/del. Other objs will follow, such as FDB. So go ahead and convert IPv4 FIB over to switchdev obj for consistency, anticipating more objs to come. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 49 ++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 9210355ec965..65d49d4477b9 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -641,8 +641,19 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 nlflags, u32 tb_id) { + struct switchdev_obj fib_obj = { + .id = SWITCHDEV_OBJ_IPV4_FIB, + .ipv4_fib = { + .dst = htonl(dst), + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .nlflags = nlflags, + .tb_id = tb_id, + }, + }; struct net_device *dev; - const struct switchdev_ops *ops; int err = 0; /* Don't offload route if using custom ip rules or if @@ -660,15 +671,10 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->switchdev_ops; - - if (ops->switchdev_fib_ipv4_add) { - err = ops->switchdev_fib_ipv4_add(dev, htonl(dst), dst_len, - fi, tos, type, nlflags, - tb_id); - if (!err) - fi->fib_flags |= RTNH_F_EXTERNAL; - } + + err = switchdev_port_obj_add(dev, &fib_obj); + if (!err) + fi->fib_flags |= RTNH_F_EXTERNAL; return err; } @@ -689,8 +695,19 @@ EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 tb_id) { + struct switchdev_obj fib_obj = { + .id = SWITCHDEV_OBJ_IPV4_FIB, + .ipv4_fib = { + .dst = htonl(dst), + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .nlflags = 0, + .tb_id = tb_id, + }, + }; struct net_device *dev; - const struct switchdev_ops *ops; int err = 0; if (!(fi->fib_flags & RTNH_F_EXTERNAL)) @@ -699,14 +716,10 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->switchdev_ops; - if (ops->switchdev_fib_ipv4_del) { - err = ops->switchdev_fib_ipv4_del(dev, htonl(dst), dst_len, - fi, tos, type, tb_id); - if (!err) - fi->fib_flags &= ~RTNH_F_EXTERNAL; - } + err = switchdev_port_obj_del(dev, &fib_obj); + if (!err) + fi->fib_flags &= ~RTNH_F_EXTERNAL; return err; } -- cgit From 7889cbee8357aaed85898d028829dfb4f75bae2c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:07 -0700 Subject: switchdev: remove NETIF_F_HW_SWITCH_OFFLOAD feature flag Roopa said remove the feature flag for this series and she'll work on bringing it back if needed at a later date. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1d00b8922902..eb0c3ace7458 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,7 +98,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", - [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload", }; static const char -- cgit From a2029240e5836e73ebcc1a8ddb8c22d636f89c9a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 11 May 2015 21:17:53 +0200 Subject: net: deinline netif_tx_stop_all_queues(), remove WARN_ON in netif_tx_stop_queue() These functions compile to 60 bytes of machine code each. With this .config: http://busybox.net/~vda/kernel_config there are 617 calls of netif_tx_stop_queue() and 49 calls of netif_tx_stop_all_queues() in vmlinux. To fix this, remove WARN_ON in netif_tx_stop_queue() as suggested by davem, and deinline netif_tx_stop_all_queues(). Change in code size is about 20k: text data bss dec hex filename 82426986 22255416 20627456 125309858 77813a2 vmlinux.before 82406248 22255416 20627456 125289120 777c2a0 vmlinux gcc-4.7.2 still creates deinlined version of netif_tx_stop_queue sometimes: $ nm --size-sort vmlinux | grep netif_tx_stop_queue | wc -l 190 ffffffff81b558a8 : ffffffff81b558a8: 55 push %rbp ffffffff81b558a9: 48 89 e5 mov %rsp,%rbp ffffffff81b558ac: f0 80 8f e0 01 00 00 lock orb $0x1,0x1e0(%rdi) ffffffff81b558b3: 01 ffffffff81b558b4: 5d pop %rbp ffffffff81b558b5: c3 retq This needs additional fixing. Signed-off-by: Denys Vlasenko CC: Alexei Starovoitov CC: Alexander Duyck CC: Joe Perches CC: David S. Miller CC: Jiri Pirko CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: netfilter-devel@vger.kernel.org Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e5f77c40bbd1..fd012bbe0486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6301,6 +6301,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return 0; } +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + /** * register_netdevice - register a network device * @dev: device to register -- cgit From 9eea92226407e7a117ef1ceef45380ebd000a0e2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 11 May 2015 15:19:48 -0700 Subject: pktgen: fix packet generation pkt_gen->last_ok was not set properly, so after the first burst pktgen instead of allocating new packet, will reuse old one, advance eth_type_trans further, which would mean the stack will be seeing very short bogus packets. Fixes: 62f64aed622b ("pktgen: introduce xmit_mode ''") Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8f2687da058e..62f979984a23 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1189,6 +1189,16 @@ static ssize_t pktgen_if_write(struct file *file, return -ENOTSUPP; pkt_dev->xmit_mode = M_NETIF_RECEIVE; + + /* make sure new packet is allocated every time + * pktgen_xmit() is called + */ + pkt_dev->last_ok = 1; + + /* override clone_skb if user passed default value + * at module loading time + */ + pkt_dev->clone_skb = 0; } else { sprintf(pg_result, "xmit_mode -:%s:- unknown\nAvailable modes: %s", @@ -3415,7 +3425,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* get out of the loop and wait * until skb is consumed */ - pkt_dev->last_ok = 1; break; } /* skb was 'freed' by stack, so clean few -- cgit From 9449c3cd90472141cf081af88181a56163ff7132 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 12 May 2015 18:29:44 +0800 Subject: net: make skb_dst_pop routine static As xfrm_output_one() is the only caller of skb_dst_pop(), we should make skb_dst_pop() localized. Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/xfrm/xfrm_output.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index fbcedbe33190..68ada2ca4b60 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -38,6 +38,18 @@ static int xfrm_skb_check_space(struct sk_buff *skb) return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC); } +/* Children define the path of the packet through the + * Linux networking. Thus, destinations are stackable. + */ + +static struct dst_entry *skb_dst_pop(struct sk_buff *skb) +{ + struct dst_entry *child = dst_clone(skb_dst(skb)->child); + + skb_dst_drop(skb); + return child; +} + static int xfrm_output_one(struct sk_buff *skb, int err) { struct dst_entry *dst = skb_dst(skb); -- cgit From 2006aa4a8c3571e861070b4524dc554e302924bd Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 2 May 2015 19:27:56 +0200 Subject: netfilter: ipset: Fix sparse warning "warning: cast to restricted __be32" warnings are fixed Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ipmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 7abf9788cfa8..2ec4ac5bc672 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -128,7 +128,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST || @@ -263,7 +263,7 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST) { -- cgit From 22496f098bb80480c13636c6b66b316685eeea04 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 2 May 2015 19:27:58 +0200 Subject: netfilter: ipset: Give a better name to a macro in ip_set_core.c Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index d259da3ce67a..ed05f1e8738f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -42,7 +42,7 @@ static inline struct ip_set_net *ip_set_pernet(struct net *net) } #define IP_SET_INC 64 -#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) +#define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; @@ -85,7 +85,7 @@ find_set_type(const char *name, u8 family, u8 revision) struct ip_set_type *type; list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && @@ -132,7 +132,7 @@ __find_set_type_get(const char *name, u8 family, u8 revision, /* Make sure the type is already loaded * but we don't support the revision */ list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name)) { + if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; goto unlock; } @@ -166,7 +166,7 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC)) { found = true; @@ -581,7 +581,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; - if (s != NULL && STREQ(s->name, name)) { + if (s != NULL && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; @@ -758,7 +758,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL && STREQ(set->name, name)) { + if (set != NULL && STRNCMP(set->name, name)) { *id = i; break; } @@ -787,7 +787,7 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, if (s == NULL) { if (*index == IPSET_INVALID_ID) *index = i; - } else if (STREQ(name, s->name)) { + } else if (STRNCMP(name, s->name)) { /* Name clash */ *set = s; return -EEXIST; @@ -887,7 +887,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ if ((flags & IPSET_FLAG_EXIST) && - STREQ(set->type->name, clash->type->name) && + STRNCMP(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && set->type->revision_max == clash->type->revision_max && @@ -1098,7 +1098,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && STREQ(s->name, name2)) { + if (s != NULL && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } -- cgit From 3e4e8d126c2934803006aa8437cc444f0dc9f438 Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Sat, 2 May 2015 19:28:01 +0200 Subject: netfilter: ipset: make ip_set_get_ip*_port to use skb_network_offset All the ipset functions respect skb->network_header value, except for ip_set_get_ip4_port() & ip_set_get_ip6_port(). The functions should use skb_network_offset() to get the transport header offset. Signed-off-by: Alexander Drozdov Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_getport.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 29fb01ddff93..1981f021cc60 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -98,7 +98,7 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto) { const struct iphdr *iph = ip_hdr(skb); - unsigned int protooff = ip_hdrlen(skb); + unsigned int protooff = skb_network_offset(skb) + ip_hdrlen(skb); int protocol = iph->protocol; /* See comments at tcp_match in ip_tables.c */ @@ -135,7 +135,9 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, __be16 frag_off = 0; nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + protoff = ipv6_skip_exthdr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) return false; -- cgit From caed0ed35b1e3f3f05dfcb386bcb26df7d47f0ae Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:03 +0200 Subject: netfilter: ipset: Properly calculate extensions offsets and total length Offsets and total length returned by the ip_set_elem_len() calculated incorrectly as initial set element length (i.e. len parameter) is used multiple times in offset calculations, also affecting set element total length. Use initial set element length as start offset, do not add aligned extension offset to the offset. Return offset as total length of the set element. This reduces memory requirements on per element basic for the hash:* type of sets. For example output from 'ipset -terse list test-1' on 64-bit PC, where test-1 is generated via following script: #!/bin/bash set_name='test-1' ipset create "$set_name" hash:net family inet \ timeout 10800 counters comment \ hashsize 65536 maxelem 65536 declare -i o3 o4 fmt="add $set_name 192.168.%u.%u\n" for ((o3 = 0; o3 < 256; o3++)); do for ((o4 = 0; o4 < 256; o4++)); do printf "$fmt" $o3 $o4 done done |ipset -exist restore BEFORE this patch is applied # ipset -terse list test-1 Name: test-1 Type: hash:net Revision: 6 Header: family inet hashsize 65536 maxelem 65536 timeout 10800 counters comment Size in memory: 26348440 and AFTER applying patch # ipset -terse list test-1 Name: test-1 Type: hash:net Revision: 6 Header: family inet hashsize 65536 maxelem 65536 timeout 10800 counters comment Size in memory: 7706392 References: 0 Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index ed05f1e8738f..7f9c0565e715 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -365,7 +365,7 @@ size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) { enum ip_set_ext_id id; - size_t offset = 0; + size_t offset = len; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) @@ -375,12 +375,12 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; - offset += ALIGN(len + offset, ip_set_extensions[id].align); + offset = ALIGN(offset, ip_set_extensions[id].align); set->offset[id] = offset; set->extensions |= ip_set_extensions[id].type; offset += ip_set_extensions[id].len; } - return len + offset; + return offset; } EXPORT_SYMBOL_GPL(ip_set_elem_len); -- cgit From 2b67d6e01de1fa220e3f6a4ee63d7ee07d959f55 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:05 +0200 Subject: netfilter: ipset: No need to make nomatch bitfield We do not store cidr packed with no match, so there is no need to make nomatch bitfield. This simplifies mtype_data_reset_flags() a bit. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netportnet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index bfaa94c7baa7..7cfd2dfc8f9d 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -54,7 +54,7 @@ struct hash_netportnet4_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; @@ -326,7 +326,7 @@ struct hash_netportnet6_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; -- cgit From 43ef29c91a247e9e0cc4d4538c1d1f5b8ca728c1 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:06 +0200 Subject: netfilter: ipset: Preprocessor directices cleanup * Undefine mtype_data_reset_elem before defining. * Remove duplicated mtype_gc_init undefine, move mtype_gc_init define closer to mtype_gc define. * Use htype instead of HTYPE in IPSET_TOKEN(HTYPE, _create)(). * Remove PF definition from sets: no more used. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 7 ++++--- net/netfilter/ipset/ip_set_hash_ip.c | 3 --- net/netfilter/ipset/ip_set_hash_ipmark.c | 3 --- net/netfilter/ipset/ip_set_hash_ipport.c | 3 --- net/netfilter/ipset/ip_set_hash_ipportip.c | 3 --- net/netfilter/ipset/ip_set_hash_ipportnet.c | 3 --- net/netfilter/ipset/ip_set_hash_mac.c | 1 - net/netfilter/ipset/ip_set_hash_net.c | 3 --- net/netfilter/ipset/ip_set_hash_netiface.c | 3 --- net/netfilter/ipset/ip_set_hash_netnet.c | 3 --- net/netfilter/ipset/ip_set_hash_netport.c | 3 --- net/netfilter/ipset/ip_set_hash_netportnet.c | 3 --- 12 files changed, 4 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 974ff386db0f..a0430651ecff 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -180,6 +180,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_data_equal #undef mtype_do_data_match #undef mtype_data_set_flags +#undef mtype_data_reset_elem #undef mtype_data_reset_flags #undef mtype_data_netmask #undef mtype_data_list @@ -193,7 +194,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_ahash_memsize #undef mtype_flush #undef mtype_destroy -#undef mtype_gc_init #undef mtype_same_set #undef mtype_kadt #undef mtype_uadt @@ -227,6 +227,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_data_list IPSET_TOKEN(MTYPE, _data_list) #define mtype_data_next IPSET_TOKEN(MTYPE, _data_next) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) + #define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr) @@ -234,7 +235,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) -#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) @@ -249,6 +249,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) @@ -1045,7 +1046,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u8 netmask; #endif size_t hsize; - struct HTYPE *h; + struct htype *h; struct htable *t; #ifndef IP_SET_PROTO_UNDEF diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 76959d79e9d1..d21d1e6621ce 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -74,7 +74,6 @@ hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) } #define MTYPE hash_ip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -208,12 +207,10 @@ hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) } #undef MTYPE -#undef PF #undef HOST_MASK #undef HKEY_DATALEN #define MTYPE hash_ip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 2ec4ac5bc672..e408bcfb8b4e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -77,7 +77,6 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next, } #define MTYPE hash_ipmark4 -#define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) #include "ip_set_hash_gen.h" @@ -204,12 +203,10 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #undef HKEY_DATALEN #define MTYPE hash_ipmark6 -#define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) #define IP_SET_EMIT_CREATE diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index dcbcceb9a52f..c4383ade5592 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -84,7 +84,6 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next, } #define MTYPE hash_ipport4 -#define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_ipport4_elem) #include "ip_set_hash_gen.h" @@ -245,12 +244,10 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #undef HKEY_DATALEN #define MTYPE hash_ipport6 -#define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_ipport6_elem) #define IP_SET_EMIT_CREATE diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 7ef93fc887a1..c48ff4596e34 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -86,7 +86,6 @@ hash_ipportip4_data_next(struct hash_ipportip4_elem *next, /* Common functions */ #define MTYPE hash_ipportip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -254,11 +253,9 @@ hash_ipportip6_data_next(struct hash_ipportip4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index b6012ad92781..adc3f76cb169 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -130,7 +130,6 @@ hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, } #define MTYPE hash_ipportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -381,11 +380,9 @@ hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 65690b52a4d5..2b96234212fa 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -62,7 +62,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next, } #define MTYPE hash_mac4 -#define PF 4 #define HOST_MASK 32 #define IP_SET_EMIT_CREATE #define IP_SET_PROTO_UNDEF diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 6b3ac10ac2f1..1abfe74d9564 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -109,7 +109,6 @@ hash_net4_data_next(struct hash_net4_elem *next, } #define MTYPE hash_net4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -277,11 +276,9 @@ hash_net6_data_next(struct hash_net4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_net6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 380ef5148ea1..0ba6c5833320 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -207,7 +207,6 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, } #define MTYPE hash_netiface4 -#define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) #include "ip_set_hash_gen.h" @@ -457,12 +456,10 @@ hash_netiface6_data_next(struct hash_netiface4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #undef HKEY_DATALEN #define MTYPE hash_netiface6 -#define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) #define IP_SET_EMIT_CREATE diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index ea8772afb6e7..9c6da0cf6e2a 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -128,7 +128,6 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next, } #define MTYPE hash_netnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -354,11 +353,9 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index c0ddb58d19dc..f77afd47ca54 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -125,7 +125,6 @@ hash_netport4_data_next(struct hash_netport4_elem *next, } #define MTYPE hash_netport4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -340,11 +339,9 @@ hash_netport6_data_next(struct hash_netport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netport6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 7cfd2dfc8f9d..69e544f206c6 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -139,7 +139,6 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next, } #define MTYPE hash_netportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -411,11 +410,9 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" -- cgit From 8e55d2e5903e4698a964163e0cf81261eee086ee Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:07 +0200 Subject: netfilter: ipset: Return ipset error instead of bool Statement ret = func1() || func2() returns 0 when both func1() and func2() return 0, or 1 if func1() or func2() returns non-zero. However in our case func1() and func2() returns error code on failure, so it seems good to propagate such error codes, rather than returning 1 in case of failure. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_bitmap_ip.c | 7 +++++-- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 7 +++++-- net/netfilter/ipset/ip_set_hash_ip.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_ipmark.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_ipport.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_ipportip.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_ipportnet.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_net.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_netiface.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_netnet.c | 24 ++++++++++++++++++------ net/netfilter/ipset/ip_set_hash_netport.c | 14 ++++++++++---- net/netfilter/ipset/ip_set_hash_netportnet.c | 24 ++++++++++++++++++------ 12 files changed, 126 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 55b083ec587a..306a1bf74914 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -149,8 +149,11 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 86104744b00f..c5f6a061fa35 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -250,8 +250,11 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index d21d1e6621ce..1c469df60551 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -120,8 +120,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -258,8 +261,11 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index e408bcfb8b4e..82ef5b3d9afc 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -122,8 +122,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -255,8 +258,11 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index c4383ade5592..5b36a4ed3573 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -131,8 +131,11 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -298,8 +301,11 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index c48ff4596e34..6a580e5955ea 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -133,8 +133,11 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -306,8 +309,11 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index adc3f76cb169..2b90a1db73f5 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -188,8 +188,11 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -445,8 +448,11 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 1abfe74d9564..15382e242c8b 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -159,8 +159,11 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -330,8 +333,11 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 0ba6c5833320..66ac8dde930d 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -307,8 +307,11 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -543,8 +546,11 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 9c6da0cf6e2a..f065024b486a 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -181,9 +181,15 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -408,9 +414,15 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index f77afd47ca54..624eb5b674c6 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -181,8 +181,11 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -401,8 +404,11 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 69e544f206c6..7eed11bce40a 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -199,9 +199,15 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -474,9 +480,15 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; -- cgit From d25472e4706329f595377aca38ccb664bdd14531 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:10 +0200 Subject: netfilter: ipset: Check IPSET_ATTR_PORT only once We do not need to check tb[IPSET_ATTR_PORT] != NULL before retrieving port, as this attribute is known to exist due to ip_set_attr_netorder() returning true only when attribute exists and it is in network byte order. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ipport.c | 10 ++-------- net/netfilter/ipset/ip_set_hash_ipportip.c | 10 ++-------- net/netfilter/ipset/ip_set_hash_ipportnet.c | 10 ++-------- net/netfilter/ipset/ip_set_hash_netport.c | 10 ++-------- net/netfilter/ipset/ip_set_hash_netportnet.c | 10 ++-------- 5 files changed, 10 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 5b36a4ed3573..299fab630d1f 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -139,10 +139,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -309,10 +306,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 6a580e5955ea..cb7946662fbd 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -145,10 +145,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -321,10 +318,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 2b90a1db73f5..2c39cae0df29 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -207,10 +207,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -469,10 +466,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip2, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 624eb5b674c6..91c901ca25d1 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -196,10 +196,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -420,10 +417,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], } ip6_netmask(&e.ip, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 7eed11bce40a..1233442053c9 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -225,10 +225,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr[1] = cidr; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -505,10 +502,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); -- cgit From cabfd139aa9863ebafe0bbba3f203d0c4e43e2cf Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:11 +0200 Subject: netfilter: ipset: Use HOST_MASK literal to represent host address CIDR len Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_bitmap_ip.c | 7 ++++--- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 3 ++- net/netfilter/ipset/ip_set_hash_ip.c | 2 +- net/netfilter/ipset/ip_set_hash_ipmark.c | 2 +- net/netfilter/ipset/ip_set_hash_ipport.c | 2 +- net/netfilter/ipset/ip_set_hash_ipportip.c | 2 +- net/netfilter/ipset/ip_set_hash_ipportnet.c | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 306a1bf74914..01b88ba7c430 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); #define MTYPE bitmap_ip +#define HOST_MASK 32 /* Type structure */ struct bitmap_ip { @@ -177,7 +178,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } else @@ -280,7 +281,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); } else @@ -289,7 +290,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); - if (netmask > 32) + if (netmask > HOST_MASK) return -IPSET_ERR_INVALID_NETMASK; first_ip &= ip_set_hostmask(netmask); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index c5f6a061fa35..46868b3fdf7b 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); #define MTYPE bitmap_ipmac +#define HOST_MASK 32 #define IP_SET_BITMAP_STORED_TIMEOUT enum { @@ -346,7 +347,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); } else diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 1c469df60551..1a9ef0c6d457 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -147,7 +147,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 82ef5b3d9afc..4499373ef8aa 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 299fab630d1f..4ae423c86532 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -170,7 +170,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index cb7946662fbd..fb921a53c878 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -176,7 +176,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 2c39cae0df29..4ae9804386ad 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -248,7 +248,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } -- cgit From 728a7e6903af6f901eeff9a2d9ffb06d55e9b1e4 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:15 +0200 Subject: netfilter: ipset: Return bool values instead of int Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ip.c | 10 +++++----- net/netfilter/ipset/ip_set_hash_ipmark.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_ipport.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_ipportip.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_ipportnet.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_mac.c | 7 ++++++- net/netfilter/ipset/ip_set_hash_net.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_netiface.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_netport.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_netportnet.c | 8 ++++---- 10 files changed, 43 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 1a9ef0c6d457..e90a82a177bf 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -56,15 +56,15 @@ hash_ip4_data_equal(const struct hash_ip4_elem *e1, return e1->ip == e2->ip; } -static inline bool +static bool hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -198,10 +198,10 @@ hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 4499373ef8aa..8584284fadf7 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -63,10 +63,10 @@ hash_ipmark4_data_list(struct sk_buff *skb, if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -193,10 +193,10 @@ hash_ipmark6_data_list(struct sk_buff *skb, if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 4ae423c86532..73e4d8662879 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -69,10 +69,10 @@ hash_ipport4_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -230,10 +230,10 @@ hash_ipport6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index fb921a53c878..4f8e58473db5 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -70,10 +70,10 @@ hash_ipportip4_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -239,10 +239,10 @@ hash_ipportip6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 4ae9804386ad..4363a184ee6e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -114,10 +114,10 @@ hash_ipportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -366,10 +366,10 @@ hash_ipportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 2b96234212fa..3614683e690c 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -52,7 +52,12 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, static inline bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { - return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether); + if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 15382e242c8b..2feaed362011 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -95,10 +95,10 @@ hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -266,10 +266,10 @@ hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 66ac8dde930d..322b773b3cef 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -193,10 +193,10 @@ hash_netiface4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -446,10 +446,10 @@ hash_netiface6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 91c901ca25d1..bd454653edb1 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -110,10 +110,10 @@ hash_netport4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -325,10 +325,10 @@ hash_netport6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 1233442053c9..dd64ba935d10 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -124,10 +124,10 @@ hash_netportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -399,10 +399,10 @@ hash_netportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void -- cgit From 037261866c8dd164c426580160973eb2d68f688c Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:16 +0200 Subject: netfilter: ipset: Check for comment netlink attribute length Ensure userspace supplies string not longer than IPSET_MAX_COMMENT_SIZE. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_bitmap_ip.c | 3 ++- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 3 ++- net/netfilter/ipset/ip_set_bitmap_port.c | 3 ++- net/netfilter/ipset/ip_set_hash_ip.c | 3 ++- net/netfilter/ipset/ip_set_hash_ipmark.c | 3 ++- net/netfilter/ipset/ip_set_hash_ipport.c | 3 ++- net/netfilter/ipset/ip_set_hash_ipportip.c | 3 ++- net/netfilter/ipset/ip_set_hash_ipportnet.c | 3 ++- net/netfilter/ipset/ip_set_hash_mac.c | 3 ++- net/netfilter/ipset/ip_set_hash_net.c | 3 ++- net/netfilter/ipset/ip_set_hash_netiface.c | 3 ++- net/netfilter/ipset/ip_set_hash_netnet.c | 3 ++- net/netfilter/ipset/ip_set_hash_netport.c | 3 ++- net/netfilter/ipset/ip_set_hash_netportnet.c | 3 ++- net/netfilter/ipset/ip_set_list_set.c | 3 ++- 15 files changed, 30 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 01b88ba7c430..2fe6de46f6d0 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -364,7 +364,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 46868b3fdf7b..eb188561d65f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -401,7 +401,8 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 005dd36444c3..898edb693b3f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -294,7 +294,8 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index e90a82a177bf..ee7a72b13a5a 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -304,7 +304,8 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 8584284fadf7..6ac7073fc850 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -310,7 +310,8 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 73e4d8662879..a3117d4ec267 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -373,7 +373,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 4f8e58473db5..89615f134845 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -385,7 +385,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 4363a184ee6e..6ba7a7e083f9 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -544,7 +544,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 3614683e690c..1f8668d7a538 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -153,7 +153,8 @@ static struct ip_set_type hash_mac_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 2feaed362011..2e63dad8644d 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -386,7 +386,8 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 322b773b3cef..07920b60bbf8 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -616,7 +616,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index f065024b486a..847047483560 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -479,7 +479,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index bd454653edb1..8273819c1a2f 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -492,7 +492,8 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index dd64ba935d10..1451a8ac938f 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -580,7 +580,8 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index f8f682806e36..5bd3b1eae3fa 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -678,7 +678,8 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, -- cgit From 58cc06daeacc8158f7471f3331965ec34aa5a172 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:18 +0200 Subject: netfilter: ipset: Fix hashing for ipv6 sets HKEY_DATALEN remains defined after first inclusion of ip_set_hash_gen.h, so it is incorrectly reused for IPv6 code. Undefine HKEY_DATALEN in ip_set_hash_gen.h at the end. Also remove some useless defines of HKEY_DATALEN in ip_set_hash_{ip{,mark,port},netiface}.c as ip_set_hash_gen.h defines it correctly for such set types anyway. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 2 ++ net/netfilter/ipset/ip_set_hash_ip.c | 1 - net/netfilter/ipset/ip_set_hash_ipmark.c | 3 --- net/netfilter/ipset/ip_set_hash_ipport.c | 3 --- net/netfilter/ipset/ip_set_hash_netiface.c | 1 - 5 files changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index a0430651ecff..7b72209e8102 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1166,3 +1166,5 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, return 0; } #endif /* IP_SET_EMIT_CREATE */ + +#undef HKEY_DATALEN diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index ee7a72b13a5a..54df48b5c455 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -211,7 +211,6 @@ hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) #undef MTYPE #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ip6 #define HOST_MASK 128 diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 6ac7073fc850..061e7e8a614c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -78,7 +78,6 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next, #define MTYPE hash_ipmark4 #define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) #include "ip_set_hash_gen.h" static int @@ -207,11 +206,9 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next, #undef MTYPE #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ipmark6 #define HOST_MASK 128 -#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index a3117d4ec267..e58704eba365 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -85,7 +85,6 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next, #define MTYPE hash_ipport4 #define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) #include "ip_set_hash_gen.h" static int @@ -245,11 +244,9 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next, #undef MTYPE #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ipport6 #define HOST_MASK 128 -#define HKEY_DATALEN sizeof(struct hash_ipport6_elem) #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 07920b60bbf8..fe481f677f56 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -460,7 +460,6 @@ hash_netiface6_data_next(struct hash_netiface4_elem *next, #undef MTYPE #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_netiface6 #define HOST_MASK 128 -- cgit From 1823fb79e5b6e3950fd24d2990b3b5c45283f15c Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Sat, 2 May 2015 19:28:19 +0200 Subject: netfilter: ipset: Improve preprocessor macros checks Check if mandatory MTYPE, HTYPE and HOST_MASK macros defined. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 13 ++++++++++++- net/netfilter/ipset/ip_set_hash_ipmark.c | 6 +++--- net/netfilter/ipset/ip_set_hash_ipport.c | 6 +++--- 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7b72209e8102..7952869c8023 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -253,6 +253,14 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) +#ifndef MTYPE +#error "MTYPE is not defined!" +#endif + +#ifndef HOST_MASK +#error "HOST_MASK is not defined!" +#endif + #ifndef HKEY_DATALEN #define HKEY_DATALEN sizeof(struct mtype_elem) #endif @@ -262,6 +270,9 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) & jhash_mask(htable_bits)) #ifndef htype +#ifndef HTYPE +#error "HTYPE is not defined!" +#endif /* HTYPE */ #define htype HTYPE /* The generic hash structure */ @@ -288,7 +299,7 @@ struct htype { struct net_prefixes nets[0]; /* book-keeping of prefixes */ #endif }; -#endif +#endif /* htype */ #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 061e7e8a614c..d231248eb3e2 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -76,8 +76,8 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next, next->ip = d->ip; } -#define MTYPE hash_ipmark4 -#define HOST_MASK 32 +#define MTYPE hash_ipmark4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -209,7 +209,7 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next, #define MTYPE hash_ipmark6 #define HOST_MASK 128 -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index e58704eba365..a47c29f12090 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -83,8 +83,8 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next, next->port = d->port; } -#define MTYPE hash_ipport4 -#define HOST_MASK 32 +#define MTYPE hash_ipport4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -247,7 +247,7 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next, #define MTYPE hash_ipport6 #define HOST_MASK 128 -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int -- cgit From a9756e6f63518c6da319904ded5050ec1780b800 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 2 May 2015 19:28:23 +0200 Subject: netfilter: ipset: Use better include files in xt_set.c Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_set.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 89045982ec94..b103e9627716 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -15,8 +15,9 @@ #include #include -#include +#include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); -- cgit From 22c1f67ea5cd3296ee82df7397fb5e5f81eccb98 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Tue, 12 May 2015 23:03:51 -0700 Subject: switchdev: sparse warning: make __switchdev_port_obj_add static Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 65d49d4477b9..a267f7728165 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -188,7 +188,8 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); -int __switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj) +static int __switchdev_port_obj_add(struct net_device *dev, + struct switchdev_obj *obj) { const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; -- cgit From 7a7ee5312d133a01cb23626c133ae30692ecb748 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Tue, 12 May 2015 23:03:52 -0700 Subject: switchdev: sparse warning: pass ipv4 fib dst as network-byte order And let driver convert it to host-byte order as needed. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index a267f7728165..77f1b6e3f78e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -645,7 +645,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, struct switchdev_obj fib_obj = { .id = SWITCHDEV_OBJ_IPV4_FIB, .ipv4_fib = { - .dst = htonl(dst), + .dst = dst, .dst_len = dst_len, .fi = fi, .tos = tos, @@ -699,7 +699,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, struct switchdev_obj fib_obj = { .id = SWITCHDEV_OBJ_IPV4_FIB, .ipv4_fib = { - .dst = htonl(dst), + .dst = dst, .dst_len = dst_len, .fi = fi, .tos = tos, -- cgit From 42275bd8fcb351f951781d8882f359d25976824b Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Wed, 13 May 2015 11:16:50 -0700 Subject: switchdev: don't use anonymous union on switchdev attr/obj structs Older gcc versions (e.g. gcc version 4.4.6) don't like anonymous unions which was causing build issues on the newly added switchdev attr/obj structs. Fix this by using named union on structs. Signed-off-by: Scott Feldman Reported-by: Or Gerlitz Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 2 +- net/core/net-sysfs.c | 4 ++-- net/core/rtnetlink.c | 3 ++- net/dsa/slave.c | 6 +++--- net/switchdev/switchdev.c | 39 ++++++++++++++++++++------------------- 5 files changed, 28 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index b9300da31565..45f1ff113af9 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -41,7 +41,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { .id = SWITCHDEV_ATTR_PORT_STP_STATE, - .stp_state = state, + .u.stp_state = state, }; int err; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5a9ce96f6d27..18b34d771ed4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -465,8 +465,8 @@ static ssize_t phys_switch_id_show(struct device *dev, ret = switchdev_port_attr_get(netdev, &attr); if (!ret) - ret = sprintf(buf, "%*phN\n", attr.ppid.id_len, - attr.ppid.id); + ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len, + attr.u.ppid.id); } rtnl_unlock(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c6c6b2c34926..141ccc357e2e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1016,7 +1016,8 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.ppid.id_len, attr.ppid.id)) + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len, + attr.u.ppid.id)) return -EMSGSIZE; return 0; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3fb5210e318c..04ffad311704 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -353,7 +353,7 @@ static int dsa_slave_port_attr_set(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_PORT_STP_STATE: if (attr->trans == SWITCHDEV_TRANS_COMMIT) - ret = dsa_slave_stp_update(dev, attr->stp_state); + ret = dsa_slave_stp_update(dev, attr->u.stp_state); break; default: ret = -EOPNOTSUPP; @@ -408,8 +408,8 @@ static int dsa_slave_port_attr_get(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_PORT_PARENT_ID: - attr->ppid.id_len = sizeof(ds->index); - memcpy(&attr->ppid.id, &ds->index, attr->ppid.id_len); + attr->u.ppid.id_len = sizeof(ds->index); + memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len); break; default: return -EOPNOTSUPP; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 77f1b6e3f78e..0409f9b5bdbc 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -383,7 +383,7 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return err; return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, - attr.brport_flags, mask, nlflags); + attr.u.brport_flags, mask, nlflags); } EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); @@ -402,9 +402,9 @@ static int switchdev_port_br_setflag(struct net_device *dev, return err; if (flag) - attr.brport_flags |= brport_flag; + attr.u.brport_flags |= brport_flag; else - attr.brport_flags &= ~brport_flag; + attr.u.brport_flags &= ~brport_flag; return switchdev_port_attr_set(dev, &attr); } @@ -466,6 +466,7 @@ static int switchdev_port_br_afspec(struct net_device *dev, struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_VLAN, }; + struct switchdev_obj_vlan *vlan = &obj.u.vlan; int rem; int err; @@ -475,30 +476,30 @@ static int switchdev_port_br_afspec(struct net_device *dev, if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo = nla_data(attr); - obj.vlan.flags = vinfo->flags; + vlan->flags = vinfo->flags; if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - if (obj.vlan.vid_start) + if (vlan->vid_start) return -EINVAL; - obj.vlan.vid_start = vinfo->vid; + vlan->vid_start = vinfo->vid; } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { - if (!obj.vlan.vid_start) + if (!vlan->vid_start) return -EINVAL; - obj.vlan.vid_end = vinfo->vid; - if (obj.vlan.vid_end <= obj.vlan.vid_start) + vlan->vid_end = vinfo->vid; + if (vlan->vid_end <= vlan->vid_start) return -EINVAL; err = f(dev, &obj); if (err) return err; - memset(&obj.vlan, 0, sizeof(obj.vlan)); + memset(vlan, 0, sizeof(*vlan)); } else { - if (obj.vlan.vid_start) + if (vlan->vid_start) return -EINVAL; - obj.vlan.vid_start = vinfo->vid; - obj.vlan.vid_end = vinfo->vid; + vlan->vid_start = vinfo->vid; + vlan->vid_end = vinfo->vid; err = f(dev, &obj); if (err) return err; - memset(&obj.vlan, 0, sizeof(obj.vlan)); + memset(vlan, 0, sizeof(*vlan)); } } @@ -613,10 +614,10 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) return NULL; if (nhsel > 0) { - if (prev_attr.ppid.id_len != attr.ppid.id_len) + if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len) return NULL; - if (memcmp(prev_attr.ppid.id, attr.ppid.id, - attr.ppid.id_len)) + if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id, + attr.u.ppid.id_len)) return NULL; } @@ -644,7 +645,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, { struct switchdev_obj fib_obj = { .id = SWITCHDEV_OBJ_IPV4_FIB, - .ipv4_fib = { + .u.ipv4_fib = { .dst = dst, .dst_len = dst_len, .fi = fi, @@ -698,7 +699,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, { struct switchdev_obj fib_obj = { .id = SWITCHDEV_OBJ_IPV4_FIB, - .ipv4_fib = { + .u.ipv4_fib = { .dst = dst, .dst_len = dst_len, .fi = fi, -- cgit From e578d9c02587d57bfa7b560767c698a668a468c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 May 2015 19:50:41 +0200 Subject: net: sched: use counter to break reclassify loops Seems all we want here is to avoid endless 'goto reclassify' loop. tc_classify_compat even resets this counter when something other than TC_ACT_RECLASSIFY is returned, so this skb-counter doesn't break hypothetical loops induced by something other than perpetual TC_ACT_RECLASSIFY return values. skb_act_clone is now identical to skb_clone, so just use that. Tested with following (bogus) filter: tc filter add dev eth0 parent ffff: \ protocol ip u32 match u32 0 0 police rate 10Kbit burst \ 64000 mtu 1500 action reclassify Acked-by: Daniel Borkmann Signed-off-by: Florian Westphal Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 2 +- net/sched/sch_api.c | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 3f63ceac8e01..a42a3b257226 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -151,7 +151,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } at = G_TC_AT(skb->tc_verd); - skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); + skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 == NULL) goto out; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ad9eed70bc8f..0b74dc0ede9c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1816,13 +1816,8 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, continue; err = tp->classify(skb, tp, res); - if (err >= 0) { -#ifdef CONFIG_NET_CLS_ACT - if (err != TC_ACT_RECLASSIFY && skb->tc_verd) - skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); -#endif + if (err >= 0) return err; - } } return -1; } @@ -1834,23 +1829,22 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, int err = 0; #ifdef CONFIG_NET_CLS_ACT const struct tcf_proto *otp = tp; + int limit = 0; reclassify: #endif err = tc_classify_compat(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT if (err == TC_ACT_RECLASSIFY) { - u32 verd = G_TC_VERD(skb->tc_verd); tp = otp; - if (verd++ >= MAX_REC_LOOP) { + if (unlikely(limit++ >= MAX_REC_LOOP)) { net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", tp->q->ops->id, tp->prio & 0xffff, ntohs(tp->protocol)); return TC_ACT_SHOT; } - skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); goto reclassify; } #endif -- cgit From 1bd758eb1cab2fa5b71a23f9e5d3c8076f4ed650 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:07 +0200 Subject: net: change name of flow_dissector header to match the .c file name add couple of empty lines on the way. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 2 +- net/sched/cls_flow.c | 2 +- net/sched/sch_choke.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d3acc4dff4ae..eaa86b539221 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include /* copy saddr & daddr, possibly using 64bit load/store diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a620c4e288a5..4c5a92298906 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index dfe3da75594c..2624e991a2d1 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* CHOKe stateless AQM for fair bandwidth allocation -- cgit From 10b89ee43e849544eddfe34e535341fc077464ec Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:09 +0200 Subject: net: move *skb_get_poff declarations into correct header Since these functions are defined in flow_dissector.c, move header declarations from skbuff.h into flow_dissector.h Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/filter.c | 1 + net/ethernet/eth.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a831f193e2c7..6805717be614 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9045e2a1108f..9332a0ab0698 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -58,6 +58,7 @@ #include #include #include +#include #include __setup("ether=", netdev_boot_setup); -- cgit From d4fd32757176d1b03533770ef75aa2b781fa5667 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:10 +0200 Subject: flow_dissector: fix doc for __skb_get_hash and remove couple of empty lines Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index eaa86b539221..0d9bc3a586ba 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -337,8 +337,11 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, } EXPORT_SYMBOL(make_flow_keys_digest); -/* - * __skb_get_hash: calculate a flow hash based on src/dst addresses +/** + * __skb_get_hash: calculate a flow hash + * @skb: sk_buff to calculate flow hash from + * + * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. @@ -353,12 +356,9 @@ void __skb_get_hash(struct sk_buff *skb) hash = ___skb_get_hash(skb, &keys, hashrnd); if (!hash) return; - if (keys.ports) skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; } EXPORT_SYMBOL(__skb_get_hash); -- cgit From 5605c76240aadc823e3d46ac9afde2f26fbcf019 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:12 +0200 Subject: net: move __skb_tx_hash to dev.c __skb_tx_hash function has no relation to flow_dissect so just move it to dev.c Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++++++++++++++++++ net/core/flow_dissector.c | 28 ---------------------------- 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 90a568a150b4..d044d2f8532b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2350,6 +2350,34 @@ void netif_device_attach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_attach); +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0d9bc3a586ba..07ca11d06339 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -371,34 +371,6 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. - */ -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues) -{ - u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; - - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } - - return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; -} -EXPORT_SYMBOL(__skb_tx_hash); - u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { -- cgit From 638b2a699fd3ec926d6dda2d2bd96e8f1c49e463 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:13 +0200 Subject: net: move netdev_pick_tx and dependencies to net/core/dev.c next to its user. No relation to flow_dissector so it makes no sense to have it in flow_dissector.c Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 78 ----------------------------------------------- 2 files changed, 78 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d044d2f8532b..af549062ae8e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2936,6 +2936,84 @@ int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[skb->sender_cpu - 1]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, skb); + if (new_index < 0) + new_index = skb_tx_hash(dev, skb); + + if (queue_index != new_index && sk && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb, + void *accel_priv) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + if (skb->sender_cpu == 0) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); + else + queue_index = __netdev_pick_tx(dev, skb); + + if (!accel_priv) + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 07ca11d06339..04f87237bd64 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -431,81 +431,3 @@ u32 skb_get_poff(const struct sk_buff *skb) return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } - -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) -{ -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; - - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[skb->sender_cpu - 1]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); - - return queue_index; -#else - return -1; -#endif -} - -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - int queue_index = sk_tx_queue_get(sk); - - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); - if (new_index < 0) - new_index = skb_tx_hash(dev, skb); - - if (queue_index != new_index && sk && - rcu_access_pointer(sk->sk_dst_cache)) - sk_tx_queue_set(sk, new_index); - - queue_index = new_index; - } - - return queue_index; -} - -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb, - void *accel_priv) -{ - int queue_index = 0; - -#ifdef CONFIG_XPS - if (skb->sender_cpu == 0) - skb->sender_cpu = raw_smp_processor_id() + 1; -#endif - - if (dev->real_num_tx_queues != 1) { - const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, accel_priv, - __netdev_pick_tx); - else - queue_index = __netdev_pick_tx(dev, skb); - - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); - } - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); -} -- cgit From 0db89b8b3243829704225e756ca7417e06812bba Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:14 +0200 Subject: flow_dissector: fix doc for skb_get_poff Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 04f87237bd64..d885e282908e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -417,8 +417,12 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, return poff; } -/* skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically +/** + * skb_get_poff - get the offset to the payload + * @skb: sk_buff to get the payload offset from + * + * The function will get the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. */ -- cgit From fbff949e3bc7f3f7d9e8b3ef4855ec7138276a25 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:15 +0200 Subject: flow_dissector: introduce programable flow_dissector Introduce dissector infrastructure which allows user to specify which parts of skb he wants to dissect. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d885e282908e..6883e22b8fc4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -15,6 +16,53 @@ #include #include +static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + return flow_dissector->used_keys & (1 << key_id); +} + +static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + flow_dissector->used_keys |= (1 << key_id); +} + +static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id, + void *target_container) +{ + return ((char *) target_container) + flow_dissector->offset[key_id]; +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count) +{ + unsigned int i; + + memset(flow_dissector, 0, sizeof(*flow_dissector)); + + for (i = 0; i < key_count; i++, key++) { + /* User should make sure that every key target offset is withing + * boundaries of unsigned short. + */ + BUG_ON(key->offset > USHRT_MAX); + BUG_ON(skb_flow_dissector_uses_key(flow_dissector, + key->key_id)); + + skb_flow_dissector_set_key(flow_dissector, key->key_id); + flow_dissector->offset[key->key_id] = key->offset; + } + + /* Ensure that the dissector always includes basic key. That way + * we are able to avoid handling lack of it in fast path. + */ + BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); +} +EXPORT_SYMBOL(skb_flow_dissector_init); + /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; * flow->dst = iph->daddr; -- cgit From 06635a35d13d42b95422bba6633f175245cc644e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:16 +0200 Subject: flow_dissect: use programable dissector in skb_flow_dissect and friends Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 199 ++++++++++++++++++++++++++++++++-------------- net/ethernet/eth.c | 6 +- net/sched/cls_flow.c | 20 ++--- net/sched/sch_choke.c | 4 +- 4 files changed, 153 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6883e22b8fc4..6a49acaa6651 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -63,17 +64,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -/* copy saddr & daddr, possibly using 64bit load/store - * Equivalent to : flow->src = iph->saddr; - * flow->dst = iph->daddr; - */ -static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) -{ - BUILD_BUG_ON(offsetof(typeof(*flow), dst) != - offsetof(typeof(*flow), src) + sizeof(flow->src)); - memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); -} - /** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from @@ -111,17 +101,27 @@ EXPORT_SYMBOL(__skb_flow_get_ports); /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * - * The function will try to retrieve the struct flow_keys from either the skbuff - * or a raw buffer specified by the rest parameters + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. */ -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, __be16 proto, int nhoff, int hlen) { + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; u8 ip_proto; if (!data) { @@ -131,7 +131,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, hlen = skb_headlen(skb); } - memset(flow, 0, sizeof(*flow)); + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); again: switch (proto) { @@ -148,14 +153,13 @@ ip: if (ip_is_fragment(iph)) ip_proto = 0; - /* skip the address processing if skb is NULL. The assumption - * here is that if there is no skb we are not looking for flow - * info but lengths and protocols. - */ - if (!skb) + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; - - iph_to_flow_copy_addrs(flow, iph); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs)); break; } case htons(ETH_P_IPV6): { @@ -171,12 +175,15 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - /* see comment above in IPv4 section */ - if (!skb) + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) break; + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); - flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); flow_label = ip6_flowlabel(iph); if (flow_label) { @@ -184,10 +191,18 @@ ipv6: * use that to represent the ports without any * further dissection. */ - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->ports = flow_label; - flow->thoff = (u16)nhoff; + + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_basic->thoff = (u16)nhoff; + + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) + break; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = flow_label; return true; } @@ -234,14 +249,22 @@ ipv6: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; - flow->src = hdr->srcnode; - flow->dst = 0; - flow->n_proto = proto; - flow->thoff = (u16)nhoff; + key_basic->n_proto = proto; + key_basic->thoff = (u16)nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { + return true; + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); + key_addrs->src = hdr->srcnode; + key_addrs->dst = 0; + } return true; } case htons(ETH_P_FCOE): - flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: return false; @@ -296,14 +319,24 @@ ipv6: break; } - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->thoff = (u16) nhoff; - - /* unless skb is set we don't need to record port info */ - if (skb) - flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_basic->thoff = (u16) nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + } return true; } @@ -325,16 +358,16 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) u32 hash; /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->dst < (__force u32)keys->src) || - (((__force u32)keys->dst == (__force u32)keys->src) && - ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { - swap(keys->dst, keys->src); - swap(keys->port16[0], keys->port16[1]); + if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || + (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && + ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) { + swap(keys->addrs.dst, keys->addrs.src); + swap(keys->ports.port16[0], keys->ports.port16[1]); } - hash = __flow_hash_3words((__force u32)keys->dst, - (__force u32)keys->src, - (__force u32)keys->ports, + hash = __flow_hash_3words((__force u32)keys->addrs.dst, + (__force u32)keys->addrs.src, + (__force u32)keys->ports.ports, keyval); if (!hash) hash = 1; @@ -352,7 +385,7 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect(skb, keys)) + if (!skb_flow_dissect_flow_keys(skb, keys)) return 0; return __flow_hash_from_keys(keys, keyval); @@ -377,11 +410,11 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, memset(digest, 0, sizeof(*digest)); - data->n_proto = flow->n_proto; - data->ip_proto = flow->ip_proto; - data->ports = flow->ports; - data->src = flow->src; - data->dst = flow->dst; + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.src; + data->dst = flow->addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); @@ -404,7 +437,7 @@ void __skb_get_hash(struct sk_buff *skb) hash = ___skb_get_hash(skb, &keys, hashrnd); if (!hash) return; - if (keys.ports) + if (keys.ports.ports) skb->l4_hash = 1; skb->sw_hash = 1; skb->hash = hash; @@ -422,9 +455,9 @@ EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = keys->thoff; + u32 poff = keys->basic.thoff; - switch (keys->ip_proto) { + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; @@ -478,8 +511,52 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } + +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + .offset = offsetof(struct flow_keys, addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, +}; + +static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, +}; + +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); + +struct flow_dissector flow_keys_buf_dissector __read_mostly; + +static int __init init_default_flow_dissectors(void) +{ + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_buf_dissector, + flow_keys_buf_dissector_keys, + ARRAY_SIZE(flow_keys_buf_dissector_keys)); + return 0; +} + +late_initcall_sync(init_default_flow_dissectors); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9332a0ab0698..c3325bd2f3fb 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -131,9 +131,9 @@ u32 eth_get_headlen(void *data, unsigned int len) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!__skb_flow_dissect(NULL, &keys, data, - eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.thoff, sizeof(*eth)); + if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, + sizeof(*eth), len)) + return max_t(u32, keys.basic.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 4c5a92298906..4b3e3e30bf4d 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -68,35 +68,35 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->src) - return ntohl(flow->src); + if (flow->addrs.src) + return ntohl(flow->addrs.src); return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->dst) - return ntohl(flow->dst); + if (flow->addrs.dst) + return ntohl(flow->addrs.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { - return flow->ip_proto; + return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[0]); + if (flow->ports.ports) + return ntohs(flow->ports.port16[0]); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[1]); + if (flow->ports.ports) + return ntohs(flow->ports.port16[1]); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } @@ -295,7 +295,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 2624e991a2d1..93d5742dc7e0 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &temp); + skb_flow_dissect_flow_keys(skb1, &temp); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &temp); + skb_flow_dissect_flow_keys(skb2, &temp); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } -- cgit From b924933cbbfbdcaa2831a39780c116ec6e48c397 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:18 +0200 Subject: flow_dissector: introduce support for ipv6 addressses So far, only hashes made out of ipv6 addresses could be dissected. This patch introduces support for dissection of full ipv6 addresses. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6a49acaa6651..1b95d5ccc9d6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -175,16 +175,29 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) - break; - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - target_container); + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); - key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + goto flow_label; + } + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; + key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + + memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + goto flow_label; + } + break; +flow_label: flow_label = ip6_flowlabel(iph); if (flow_label) { /* Awesome, IPv6 packet has a flow label so we can -- cgit From 67a900cc0436d74e7ff89042371760def087680d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:19 +0200 Subject: flow_dissector: introduce support for Ethernet addresses Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1b95d5ccc9d6..7a0b391114a5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -138,6 +139,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); + struct flow_dissector_key_eth_addrs *key_eth_addrs; + + key_eth_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target_container); + memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); + } + again: switch (proto) { case htons(ETH_P_IP): { -- cgit From 59346afe7a5548ab3e9730aeff33993faa76abbe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:20 +0200 Subject: flow_dissector: change port array into src, dst tuple Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- net/sched/cls_flow.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7a0b391114a5..204d09c42510 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -385,9 +385,9 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) /* get a consistent hash (same value on both flow directions) */ if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && - ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) { + ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) { swap(keys->addrs.dst, keys->addrs.src); - swap(keys->ports.port16[0], keys->ports.port16[1]); + swap(keys->ports.src, keys->ports.dst); } hash = __flow_hash_3words((__force u32)keys->addrs.dst, diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 4b3e3e30bf4d..b4359924846c 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -88,7 +88,7 @@ static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flo static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) - return ntohs(flow->ports.port16[0]); + return ntohs(flow->ports.src); return addr_fold(skb->sk); } @@ -96,7 +96,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) - return ntohs(flow->ports.port16[1]); + return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } -- cgit From 77b9900ef53ae047e36a37d13a2aa33bb2d60641 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:21 +0200 Subject: tc: introduce Flower classifier This patch introduces a flow-based filter. So far, the very essential packet fields are supported. This patch is only the first step. There is a lot of potential performance improvements possible to implement. Also a lot of features are missing now. They will be addressed in follow-up patches. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/Kconfig | 10 + net/sched/Makefile | 1 + net/sched/cls_flower.c | 688 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 699 insertions(+) create mode 100644 net/sched/cls_flower.c (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2274e723a3df..5fd1c2f487d2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -477,6 +477,16 @@ config NET_CLS_BPF To compile this code as a module, choose M here: the module will be called cls_bpf. +config NET_CLS_FLOWER + tristate "Flower classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys and masks. + + To compile this code as a module, choose M here: the module will + be called cls_flower. + config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 7ca7f4c1b8c2..690c1689e090 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o +obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c new file mode 100644 index 000000000000..9bc654c764cd --- /dev/null +++ b/net/sched/cls_flower.c @@ -0,0 +1,688 @@ +/* + * net/sched/cls_flower.c Flower classifier + * + * Copyright (c) 2015 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +struct fl_flow_key { + int indev_ifindex; + struct flow_dissector_key_basic basic; + struct flow_dissector_key_eth_addrs eth; + union { + struct flow_dissector_key_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct fl_flow_mask_range { + unsigned short int start; + unsigned short int end; +}; + +struct fl_flow_mask { + struct fl_flow_key key; + struct fl_flow_mask_range range; + struct rcu_head rcu; +}; + +struct cls_fl_head { + struct rhashtable ht; + struct fl_flow_mask mask; + struct flow_dissector dissector; + u32 hgen; + bool mask_assigned; + struct list_head filters; + struct rhashtable_params ht_params; + struct rcu_head rcu; +}; + +struct cls_fl_filter { + struct rhash_head ht_node; + struct fl_flow_key mkey; + struct tcf_exts exts; + struct tcf_result res; + struct fl_flow_key key; + struct list_head list; + u32 handle; + struct rcu_head rcu; +}; + +static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) +{ + return mask->range.end - mask->range.start; +} + +static void fl_mask_update_range(struct fl_flow_mask *mask) +{ + const u8 *bytes = (const u8 *) &mask->key; + size_t size = sizeof(mask->key); + size_t i, first = 0, last = size - 1; + + for (i = 0; i < sizeof(mask->key); i++) { + if (bytes[i]) { + if (!first && i) + first = i; + last = i; + } + } + mask->range.start = rounddown(first, sizeof(long)); + mask->range.end = roundup(last + 1, sizeof(long)); +} + +static void *fl_key_get_start(struct fl_flow_key *key, + const struct fl_flow_mask *mask) +{ + return (u8 *) key + mask->range.start; +} + +static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + const long *lkey = fl_key_get_start(key, mask); + const long *lmask = fl_key_get_start(&mask->key, mask); + long *lmkey = fl_key_get_start(mkey, mask); + int i; + + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) + *lmkey++ = *lkey++ & *lmask++; +} + +static void fl_clear_masked_range(struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); +} + +static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_fl_head *head = rcu_dereference_bh(tp->root); + struct cls_fl_filter *f; + struct fl_flow_key skb_key; + struct fl_flow_key skb_mkey; + + fl_clear_masked_range(&skb_key, &head->mask); + skb_key.indev_ifindex = skb->skb_iif; + /* skb_flow_dissect() does not set n_proto in case an unknown protocol, + * so do it rather here. + */ + skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect(skb, &head->dissector, &skb_key); + + fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); + + f = rhashtable_lookup_fast(&head->ht, + fl_key_get_start(&skb_mkey, &head->mask), + head->ht_params); + if (f) { + *res = f->res; + return tcf_exts_exec(skb, &f->exts, res); + } + return -1; +} + +static int fl_init(struct tcf_proto *tp) +{ + struct cls_fl_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + + INIT_LIST_HEAD_RCU(&head->filters); + rcu_assign_pointer(tp->root, head); + + return 0; +} + +static void fl_destroy_filter(struct rcu_head *head) +{ + struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); + + tcf_exts_destroy(&f->exts); + kfree(f); +} + +static bool fl_destroy(struct tcf_proto *tp, bool force) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f, *next; + + if (!force && !list_empty(&head->filters)) + return false; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del_rcu(&f->list); + call_rcu(&f->rcu, fl_destroy_filter); + } + RCU_INIT_POINTER(tp->root, NULL); + if (head->mask_assigned) + rhashtable_destroy(&head->ht); + kfree_rcu(head, rcu); + return true; +} + +static unsigned long fl_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long) f; + return 0; +} + +static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { + [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, + [TCA_FLOWER_INDEV] = { .type = NLA_STRING, + .len = IFNAMSIZ }, + [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, +}; + +static void fl_set_key_val(struct nlattr **tb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + if (!tb[val_type]) + return; + memcpy(val, nla_data(tb[val_type]), len); + if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) + memset(mask, 0xff, len); + else + memcpy(mask, nla_data(tb[mask_type]), len); +} + +static int fl_set_key(struct net *net, struct nlattr **tb, + struct fl_flow_key *key, struct fl_flow_key *mask) +{ + int err; + + if (tb[TCA_FLOWER_INDEV]) { + err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + if (err < 0) + return err; + key->indev_ifindex = err; + mask->indev_ifindex = 0xffffffff; + } + + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)); + fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)); + fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + if (key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) { + fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto)); + } + if (key->basic.n_proto == htons(ETH_P_IP)) { + fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)); + fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)); + } else if (key->basic.n_proto == htons(ETH_P_IPV6)) { + fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)); + fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)); + } + if (key->basic.ip_proto == IPPROTO_TCP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } else if (key->basic.ip_proto == IPPROTO_UDP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } + + return 0; +} + +static bool fl_mask_eq(struct fl_flow_mask *mask1, + struct fl_flow_mask *mask2) +{ + const long *lmask1 = fl_key_get_start(&mask1->key, mask1); + const long *lmask2 = fl_key_get_start(&mask2->key, mask2); + + return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && + !memcmp(lmask1, lmask2, fl_mask_range(mask1)); +} + +static const struct rhashtable_params fl_ht_params = { + .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */ + .head_offset = offsetof(struct cls_fl_filter, ht_node), + .automatic_shrinking = true, +}; + +static int fl_init_hashtable(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + head->ht_params = fl_ht_params; + head->ht_params.key_len = fl_mask_range(mask); + head->ht_params.key_offset += mask->range.start; + + return rhashtable_init(&head->ht, &head->ht_params); +} + +#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) +#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) +#define FL_KEY_MEMBER_END_OFFSET(member) \ + (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) + +#define FL_KEY_IN_RANGE(mask, member) \ + (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ + FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) + +#define FL_KEY_SET(keys, cnt, id, member) \ + do { \ + keys[cnt].key_id = id; \ + keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \ + cnt++; \ + } while(0); + +#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ + do { \ + if (FL_KEY_IN_RANGE(mask, member)) \ + FL_KEY_SET(keys, cnt, id, member); \ + } while(0); + +static void fl_init_dissector(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; + size_t cnt = 0; + + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + + skb_flow_dissector_init(&head->dissector, keys, cnt); +} + +static int fl_check_assign_mask(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + int err; + + if (head->mask_assigned) { + if (!fl_mask_eq(&head->mask, mask)) + return -EINVAL; + else + return 0; + } + + /* Mask is not assigned yet. So assign it and init hashtable + * according to that. + */ + err = fl_init_hashtable(head, mask); + if (err) + return err; + memcpy(&head->mask, mask, sizeof(head->mask)); + head->mask_assigned = true; + + fl_init_dissector(head, mask); + + return 0; +} + +static int fl_set_parms(struct net *net, struct tcf_proto *tp, + struct cls_fl_filter *f, struct fl_flow_mask *mask, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts e; + int err; + + tcf_exts_init(&e, TCA_FLOWER_ACT, 0); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + return err; + + if (tb[TCA_FLOWER_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + err = fl_set_key(net, tb, &f->key, &mask->key); + if (err) + goto errout; + + fl_mask_update_range(mask); + fl_set_masked_key(&f->mkey, &f->key, mask); + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(&e); + return err; +} + +static u32 fl_grab_new_handle(struct tcf_proto *tp, + struct cls_fl_head *head) +{ + unsigned int i = 0x80000000; + u32 handle; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && fl_get(tp, head->hgen)); + + if (unlikely(i == 0)) { + pr_err("Insufficient number of handles\n"); + handle = 0; + } else { + handle = head->hgen; + } + + return handle; +} + +static int fl_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; + struct cls_fl_filter *fnew; + struct nlattr *tb[TCA_FLOWER_MAX + 1]; + struct fl_flow_mask mask = {}; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); + if (err < 0) + return err; + + if (fold && handle && fold->handle != handle) + return -EINVAL; + + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (!fnew) + return -ENOBUFS; + + tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + + if (!handle) { + handle = fl_grab_new_handle(tp, head); + if (!handle) { + err = -EINVAL; + goto errout; + } + } + fnew->handle = handle; + + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); + if (err) + goto errout; + + err = fl_check_assign_mask(head, &mask); + if (err) + goto errout; + + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, + head->ht_params); + if (err) + goto errout; + if (fold) + rhashtable_remove_fast(&head->ht, &fold->ht_node, + head->ht_params); + + *arg = (unsigned long) fnew; + + if (fold) { + list_replace_rcu(&fnew->list, &fold->list); + tcf_unbind_filter(tp, &fold->res); + call_rcu(&fold->rcu, fl_destroy_filter); + } else { + list_add_tail_rcu(&fnew->list, &head->filters); + } + + return 0; + +errout: + kfree(fnew); + return err; +} + +static int fl_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) arg; + + rhashtable_remove_fast(&head->ht, &f->ht_node, + head->ht_params); + list_del_rcu(&f->list); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fl_destroy_filter); + return 0; +} + +static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry_rcu(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int fl_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + int err; + + if (!memchr_inv(mask, 0, len)) + return 0; + err = nla_put(skb, val_type, len, val); + if (err) + return err; + if (mask_type != TCA_FLOWER_UNSPEC) { + err = nla_put(skb, mask_type, len, mask); + if (err) + return err; + } + return 0; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &head->mask.key; + + if (mask->indev_ifindex) { + struct net_device *dev; + + dev = __dev_get_by_index(net, key->indev_ifindex); + if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) + goto nla_put_failure; + } + + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)) || + fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)) || + fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto))) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) && + fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto))) + goto nla_put_failure; + + if (key->basic.n_proto == htons(ETH_P_IP) && + (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)) || + fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)))) + goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IPV6) && + (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)) || + fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)))) + goto nla_put_failure; + + if (key->basic.ip_proto == IPPROTO_TCP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_UDP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_fl_ops __read_mostly = { + .kind = "flower", + .classify = fl_classify, + .init = fl_init, + .destroy = fl_destroy, + .get = fl_get, + .change = fl_change, + .delete = fl_delete, + .walk = fl_walk, + .dump = fl_dump, + .owner = THIS_MODULE, +}; + +static int __init cls_fl_init(void) +{ + return register_tcf_proto_ops(&cls_fl_ops); +} + +static void __exit cls_fl_exit(void) +{ + unregister_tcf_proto_ops(&cls_fl_ops); +} + +module_init(cls_fl_init); +module_exit(cls_fl_exit); + +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Flower classifier"); +MODULE_LICENSE("GPL v2"); -- cgit From 216f8bb9f67ff636769064f86812e1ce6466e4af Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 May 2015 06:22:56 -0700 Subject: tcp/dccp: tw_timer_handler() is static tw_timer_handler() is only used from net/ipv4/inet_timewait_sock.c Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 00ec8d5d7e7e..2ffbd16b79e0 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -170,7 +170,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -void tw_timer_handler(unsigned long data) +static void tw_timer_handler(unsigned long data) { struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; -- cgit From 7d771aaac7b2459013e9a246f16c06d4f2b819e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 May 2015 06:31:48 -0700 Subject: ipv4: __ip_local_out_sk() is static __ip_local_out_sk() is only used from net/ipv4/ip_output.c net/ipv4/ip_output.c:94:5: warning: symbol '__ip_local_out_sk' was not declared. Should it be static? Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c65b93a7b711..2acc5dc32807 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -91,7 +91,7 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); -- cgit From ad377cab4966e2f9162f05be1f7eeae466d411a8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:45 -0400 Subject: packet: rollover prepare: move code out of callsites packet_rcv_fanout calls fanout_demux_rollover twice. Move all rollover logic into the callee to simplify these callsites, especially with upcoming changes. The main differences between the two callsites is that the FLAG variant tests whether the socket previously selected by another mode (RR, RND, HASH, ..) has room before migrating flows, whereas the rollover mode has no original socket to test. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 12c5dde8e344..5c88ecf22cc0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1318,18 +1318,22 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, - unsigned int idx, unsigned int skip, + unsigned int idx, bool try_self, unsigned int num) { unsigned int i, j; + if (try_self && packet_rcv_has_room(pkt_sk(f->arr[idx]), skb)) + return idx; + i = j = min_t(int, f->next[idx], num - 1); do { - if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { if (i != j) f->next[idx] = i; return i; } + if (++i == num) i = 0; } while (i != j); @@ -1386,17 +1390,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: - idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); + idx = fanout_demux_rollover(f, skb, 0, false, num); break; } - po = pkt_sk(f->arr[idx]); - if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && - unlikely(!packet_rcv_has_room(po, skb))) { - idx = fanout_demux_rollover(f, skb, idx, idx, num); - po = pkt_sk(f->arr[idx]); - } + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) + idx = fanout_demux_rollover(f, skb, idx, true, num); + po = pkt_sk(f->arr[idx]); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } -- cgit From 0648ab70afe6c3bf2369a6d779b44a85121c063d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:46 -0400 Subject: packet: rollover prepare: per-socket state Replace rollover state per fanout group with state per socket. Future patches will add fields to the new structure. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 21 ++++++++++++++++++--- net/packet/internal.h | 6 +++++- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5c88ecf22cc0..ad3d9ff56541 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1321,16 +1321,18 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int idx, bool try_self, unsigned int num) { + struct packet_sock *po; unsigned int i, j; - if (try_self && packet_rcv_has_room(pkt_sk(f->arr[idx]), skb)) + po = pkt_sk(f->arr[idx]); + if (try_self && packet_rcv_has_room(po, skb)) return idx; - i = j = min_t(int, f->next[idx], num - 1); + i = j = min_t(int, po->rollover->sock, num - 1); do { if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { if (i != j) - f->next[idx] = i; + po->rollover->sock = i; return i; } @@ -1468,6 +1470,12 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (po->fanout) return -EALREADY; + if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) { + po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); + if (!po->rollover) + return -ENOMEM; + } + mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { @@ -1516,6 +1524,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } out: mutex_unlock(&fanout_mutex); + if (err) { + kfree(po->rollover); + po->rollover = NULL; + } return err; } @@ -1537,6 +1549,8 @@ static void fanout_release(struct sock *sk) kfree(f); } mutex_unlock(&fanout_mutex); + + kfree(po->rollover); } static const struct proto_ops packet_ops; @@ -2866,6 +2880,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); + po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) diff --git a/net/packet/internal.h b/net/packet/internal.h index fe6e20caea1d..a9d33a28a019 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -82,12 +82,15 @@ struct packet_fanout { atomic_t rr_cur; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; - int next[PACKET_FANOUT_MAX]; spinlock_t lock; atomic_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; }; +struct packet_rollover { + int sock; +} ____cacheline_aligned_in_smp; + struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; @@ -104,6 +107,7 @@ struct packet_sock { has_vnet_hdr:1; int ifindex; /* bound device */ __be16 num; + struct packet_rollover *rollover; struct packet_mclist *mclist; atomic_t mapped; enum tpacket_versions tp_version; -- cgit From 9954729bc3896998d53040c46a28830a3a3d5063 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:47 -0400 Subject: packet: rollover only to socket with headroom Only migrate flows to sockets that have sufficient headroom, where sufficient is defined as having at least 25% empty space. The kernel has three different buffer types: a regular socket, a ring with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The latter two do not expose a read pointer to the kernel, so headroom is not computed easily. All three needs a different implementation to estimate free space. Tested: Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input. bench_rollover has as many sockets as there are NIC receive queues in the system. Each socket is owned by a process that is pinned to one of the receive cpus. RFS is disabled. RPS is enabled with an identity mapping (cpu x -> cpu x), to count drops with softnettop. lpbb5:/export/hda3/willemb# ./bench_rollover -r -l 1000 -s Press [Enter] to exit cpu rx rx.k drop.k rollover r.huge r.failed 0 16 16 0 0 0 0 1 21 21 0 0 0 0 2 5227502 5227502 0 0 0 0 3 18 18 0 0 0 0 4 6083289 6083289 0 5227496 0 0 5 22 22 0 0 0 0 6 21 21 0 0 0 0 7 9 9 0 0 0 0 Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 76 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ad3d9ff56541..ffa67205f698 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1234,27 +1234,68 @@ static void packet_free_pending(struct packet_sock *po) free_percpu(po->tx_ring.pending_refcnt); } -static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +#define ROOM_POW_OFF 2 +#define ROOM_NONE 0x0 +#define ROOM_LOW 0x1 +#define ROOM_NORMAL 0x2 + +static bool __tpacket_has_room(struct packet_sock *po, int pow_off) { - struct sock *sk = &po->sk; - bool has_room; + int idx, len; + + len = po->rx_ring.frame_max + 1; + idx = po->rx_ring.head; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} - if (po->prot_hook.func != tpacket_rcv) - return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) - <= sk->sk_rcvbuf; +static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.prb_bdqc.knum_blocks; + idx = po->rx_ring.prb_bdqc.kactive_blk_num; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + struct sock *sk = &po->sk; + int ret = ROOM_NONE; + + if (po->prot_hook.func != tpacket_rcv) { + int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - skb->truesize; + if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + return ROOM_NORMAL; + else if (avail > 0) + return ROOM_LOW; + else + return ROOM_NONE; + } spin_lock(&sk->sk_receive_queue.lock); - if (po->tp_version == TPACKET_V3) - has_room = prb_lookup_block(po, &po->rx_ring, - po->rx_ring.prb_bdqc.kactive_blk_num, - TP_STATUS_KERNEL); - else - has_room = packet_lookup_frame(po, &po->rx_ring, - po->rx_ring.head, - TP_STATUS_KERNEL); + if (po->tp_version == TPACKET_V3) { + if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_v3_has_room(po, 0)) + ret = ROOM_LOW; + } else { + if (__tpacket_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_has_room(po, 0)) + ret = ROOM_LOW; + } spin_unlock(&sk->sk_receive_queue.lock); - return has_room; + return ret; } static void packet_sock_destruct(struct sock *sk) @@ -1325,12 +1366,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int i, j; po = pkt_sk(f->arr[idx]); - if (try_self && packet_rcv_has_room(po, skb)) + if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE) return idx; i = j = min_t(int, po->rollover->sock, num - 1); do { - if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + if (i != idx && + packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; return i; -- cgit From 2ccdbaa6d55b0656244ba57c4b56765a0af76c0a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:48 -0400 Subject: packet: rollover lock contention avoidance Rollover has to call packet_rcv_has_room on sockets in the fanout group to find a socket to migrate to. This operation is expensive especially if the packet sockets use rings, when a lock has to be acquired. Avoid pounding on the lock by all sockets by temporarily marking a socket as "under memory pressure" when such pressure is detected. While set, only the socket owner may call packet_rcv_has_room on the socket. Once it detects normal conditions, it clears the flag. The socket is not used as a victim by any other socket in the meantime. Under reasonably balanced load, each socket writer frequently calls packet_rcv_has_room and clears its own pressure field. As a backup for when the socket is rarely written to, also clear the flag on reading (packet_recvmsg, packet_poll) if this can be done cheaply (i.e., without calling packet_rcv_has_room). This is only for edge cases. Tested: Ran bench_rollover: a process with 8 sockets in a single fanout group, each pinned to a single cpu that receives one nic recv interrupt. RPS and RFS are disabled. The benchmark uses packet rx_ring, which has to take a lock when determining whether a socket has room. Sent 3.5 Mpps of UDP traffic with sufficient entropy to spread uniformly across the packet sockets (and inserted an iptables rule to drop in PREROUTING to avoid protocol stack processing). Without this patch, all sockets try to migrate traffic to neighbors, causing lock contention when searching for a non- empty neighbor. The lock is the top 9 entries. perf record -a -g sleep 5 - 17.82% bench_rollover [kernel.kallsyms] [k] _raw_spin_lock - _raw_spin_lock - 99.00% spin_lock + 81.77% packet_rcv_has_room.isra.41 + 18.23% tpacket_rcv + 0.84% packet_rcv_has_room.isra.41 + 5.20% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock + 5.15% ksoftirqd/1 [kernel.kallsyms] [k] _raw_spin_lock + 5.14% ksoftirqd/2 [kernel.kallsyms] [k] _raw_spin_lock + 5.12% ksoftirqd/7 [kernel.kallsyms] [k] _raw_spin_lock + 5.12% ksoftirqd/5 [kernel.kallsyms] [k] _raw_spin_lock + 5.10% ksoftirqd/4 [kernel.kallsyms] [k] _raw_spin_lock + 4.66% ksoftirqd/0 [kernel.kallsyms] [k] _raw_spin_lock + 4.45% ksoftirqd/3 [kernel.kallsyms] [k] _raw_spin_lock + 1.55% bench_rollover [kernel.kallsyms] [k] packet_rcv_has_room.isra.41 On net-next with this patch, this lock contention is no longer a top entry. Most time is spent in the actual read function. Next up are other locks: + 15.52% bench_rollover bench_rollover [.] reader + 4.68% swapper [kernel.kallsyms] [k] memcpy_erms + 2.77% swapper [kernel.kallsyms] [k] packet_lookup_frame.isra.51 + 2.56% ksoftirqd/1 [kernel.kallsyms] [k] memcpy_erms + 2.16% swapper [kernel.kallsyms] [k] tpacket_rcv + 1.93% swapper [kernel.kallsyms] [k] mlx4_en_process_rx_cq Looking closer at the remaining _raw_spin_lock, the cost of probing in rollover is now comparable to the cost of taking the lock later in tpacket_rcv. - 1.51% swapper [kernel.kallsyms] [k] _raw_spin_lock - _raw_spin_lock + 33.41% packet_rcv_has_room + 28.15% tpacket_rcv + 19.54% enqueue_to_backlog + 6.45% __free_pages_ok + 2.78% packet_rcv_fanout + 2.13% fanout_demux_rollover + 2.01% netif_receive_skb_internal Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 38 +++++++++++++++++++++++++++++++------- net/packet/internal.h | 1 + 2 files changed, 32 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ffa67205f698..3a383fd72f82 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1265,14 +1265,14 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) - - skb->truesize; + - (skb ? skb->truesize : 0); if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) @@ -1281,7 +1281,6 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) return ROOM_NONE; } - spin_lock(&sk->sk_receive_queue.lock); if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; @@ -1293,7 +1292,26 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } - spin_unlock(&sk->sk_receive_queue.lock); + + return ret; +} + +static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + int ret; + bool has_room; + + if (po->prot_hook.func == tpacket_rcv) { + spin_lock(&po->sk.sk_receive_queue.lock); + ret = __packet_rcv_has_room(po, skb); + spin_unlock(&po->sk.sk_receive_queue.lock); + } else { + ret = __packet_rcv_has_room(po, skb); + } + + has_room = ret == ROOM_NORMAL; + if (po->pressure == has_room) + xchg(&po->pressure, !has_room); return ret; } @@ -1362,7 +1380,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int idx, bool try_self, unsigned int num) { - struct packet_sock *po; + struct packet_sock *po, *po_next; unsigned int i, j; po = pkt_sk(f->arr[idx]); @@ -1371,8 +1389,9 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { - if (i != idx && - packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) { + po_next = pkt_sk(f->arr[i]); + if (po_next != po && !po_next->pressure && + packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; return i; @@ -3000,6 +3019,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; + if (pkt_sk(sk)->pressure) + packet_rcv_has_room(pkt_sk(sk), NULL); + if (pkt_sk(sk)->has_vnet_hdr) { struct virtio_net_hdr vnet_hdr = { 0 }; @@ -3755,6 +3777,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= POLLIN | POLLRDNORM; } + if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + xchg(&po->pressure, 0); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { diff --git a/net/packet/internal.h b/net/packet/internal.h index a9d33a28a019..22d7d778c5b7 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -105,6 +105,7 @@ struct packet_sock { auxdata:1, origdev:1, has_vnet_hdr:1; + int pressure; int ifindex; /* bound device */ __be16 num; struct packet_rollover *rollover; -- cgit From 3b3a5b0aab5b9ad345d4beb9a364a7dd02c23d40 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:49 -0400 Subject: packet: rollover huge flows before small flows Migrate flows from a socket to another socket in the fanout group not only when the socket is full. Start migrating huge flows early, to divert possible 4-tuple attacks without affecting normal traffic. Introduce fanout_flow_is_huge(). This detects huge flows, which are defined as taking up more than half the load. It does so cheaply, by storing the rxhashes of the N most recent packets. If over half of these are the same rxhash as the current packet, then drop it. This only protects against 4-tuple attacks. N is chosen to fit all data in a single cache line. Tested: Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input. lpbb5:/export/hda3/willemb# ./bench_rollover -l 1000 -r -s cpu rx rx.k drop.k rollover r.huge r.failed 0 14 14 0 0 0 0 1 20 20 0 0 0 0 2 16 16 0 0 0 0 3 6168824 6168824 0 4867721 4867721 0 4 4867741 4867741 0 0 0 0 5 12 12 0 0 0 0 6 15 15 0 0 0 0 7 17 17 0 0 0 0 Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 25 ++++++++++++++++++++++--- net/packet/internal.h | 2 ++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3a383fd72f82..8f0156b10f8d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1341,6 +1341,20 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num) return x; } +static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) +{ + u32 rxhash; + int i, count = 0; + + rxhash = skb_get_hash(skb); + for (i = 0; i < ROLLOVER_HLEN; i++) + if (po->rollover->history[i] == rxhash) + count++; + + po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + return count > (ROLLOVER_HLEN >> 1); +} + static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) @@ -1381,11 +1395,16 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int num) { struct packet_sock *po, *po_next; - unsigned int i, j; + unsigned int i, j, room; po = pkt_sk(f->arr[idx]); - if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE) - return idx; + + if (try_self) { + room = packet_rcv_has_room(po, skb); + if (room == ROOM_NORMAL || + (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) + return idx; + } i = j = min_t(int, po->rollover->sock, num - 1); do { diff --git a/net/packet/internal.h b/net/packet/internal.h index 22d7d778c5b7..a9d30a17c714 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -89,6 +89,8 @@ struct packet_fanout { struct packet_rollover { int sock; +#define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) + u32 history[ROLLOVER_HLEN] ____cacheline_aligned; } ____cacheline_aligned_in_smp; struct packet_sock { -- cgit From a9b6391814d5d6b8668fca2dace86949b7244e2e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:50 -0400 Subject: packet: rollover statistics Rollover indicates exceptional conditions. Export a counter to inform socket owners of this state. If no socket with sufficient room is found, rollover fails. Also count these events. Finally, also count when flows are rolled over early thanks to huge flow detection, to validate its correctness. Tested: Read counters in bench_rollover on all other tests in the patchset Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 19 ++++++++++++++++++- net/packet/internal.h | 3 +++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8f0156b10f8d..31d58565726c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1395,7 +1395,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int num) { struct packet_sock *po, *po_next; - unsigned int i, j, room; + unsigned int i, j, room = ROOM_NONE; po = pkt_sk(f->arr[idx]); @@ -1413,6 +1413,9 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; + atomic_long_inc(&po->rollover->num); + if (room == ROOM_LOW) + atomic_long_inc(&po->rollover->num_huge); return i; } @@ -1420,6 +1423,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = 0; } while (i != j); + atomic_long_inc(&po->rollover->num_failed); return idx; } @@ -1554,6 +1558,9 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); if (!po->rollover) return -ENOMEM; + atomic_long_set(&po->rollover->num, 0); + atomic_long_set(&po->rollover->num_huge, 0); + atomic_long_set(&po->rollover->num_failed, 0); } mutex_lock(&fanout_mutex); @@ -3584,6 +3591,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3659,6 +3667,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_ROLLOVER_STATS: + if (!po->rollover) + return -EINVAL; + rstats.tp_all = atomic_long_read(&po->rollover->num); + rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); + rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); + break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; break; diff --git a/net/packet/internal.h b/net/packet/internal.h index a9d30a17c714..c035d263c1e8 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -89,6 +89,9 @@ struct packet_fanout { struct packet_rollover { int sock; + atomic_long_t num; + atomic_long_t num_huge; + atomic_long_t num_failed; #define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) u32 history[ROLLOVER_HLEN] ____cacheline_aligned; } ____cacheline_aligned_in_smp; -- cgit From 125907ae5ef000a3855b33f6545d672275413836 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:26 -0400 Subject: geneve: remove MODULE_ALIAS_RTNL_LINK from net/ipv4/geneve.c This file is essentially a library for implementing the geneve encapsulation protocol. The file does not register any rtnl_link_ops, so the MODULE_ALIAS_RTNL_LINK macro is inappropriate here. Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/ipv4/geneve.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 8986e63f3bda..8e6a7fe27a4c 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -450,4 +450,3 @@ module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jesse Gross "); MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); -MODULE_ALIAS_RTNL_LINK("geneve"); -- cgit From 35d32e8fe4ab44180e46a0dd54abea6985398d00 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:27 -0400 Subject: geneve: move definition of geneve_hdr() to geneve.h This is a static inline with identical definitions in multiple places... Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/ipv4/geneve.c | 5 ----- net/openvswitch/vport-geneve.c | 5 ----- 2 files changed, 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 8e6a7fe27a4c..001843d41135 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -60,11 +60,6 @@ struct geneve_net { static int geneve_net_id; -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - static struct geneve_sock *geneve_find_sock(struct net *net, sa_family_t family, __be16 port) { diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index bf02fd5808c9..208c576bd1b6 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -46,11 +46,6 @@ static inline struct geneve_port *geneve_vport(const struct vport *vport) return vport_priv(vport); } -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - /* Convert 64 bit tunnel ID to 24 bit VNI. */ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) { -- cgit From 11e1fa46b43216458e0f67f1f0b257586c5d8e5c Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:28 -0400 Subject: geneve: Rename support library as geneve_core net/ipv4/geneve.c -> net/ipv4/geneve_core.c This name better reflects the purpose of the module. Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 4 +- net/ipv4/Makefile | 2 +- net/ipv4/geneve.c | 447 ------------------------------------------------ net/ipv4/geneve_core.c | 447 ++++++++++++++++++++++++++++++++++++++++++++++++ net/openvswitch/Kconfig | 2 +- 5 files changed, 451 insertions(+), 451 deletions(-) delete mode 100644 net/ipv4/geneve.c create mode 100644 net/ipv4/geneve_core.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index bd2901604842..d83071dccd74 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -331,8 +331,8 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE - tristate "Generic Network Virtualization Encapsulation (Geneve)" +config GENEVE_CORE + tristate "Generic Network Virtualization Encapsulation library" depends on INET select NET_UDP_TUNNEL ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 518c04ed666e..b36236dd6014 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE) += geneve.o +obj-$(CONFIG_GENEVE_CORE) += geneve_core.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c deleted file mode 100644 index 001843d41135..000000000000 --- a/net/ipv4/geneve.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Geneve: Generic Network Virtualization Encapsulation - * - * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#include -#include -#include -#endif - -/* Protects sock_list and refcounts. */ -static DEFINE_MUTEX(geneve_mutex); - -/* per-network namespace private data for this module */ -struct geneve_net { - struct list_head sock_list; -}; - -static int geneve_net_id; - -static struct geneve_sock *geneve_find_sock(struct net *net, - sa_family_t family, __be16 port) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - - list_for_each_entry(gs, &gn->sock_list, list) { - if (inet_sk(gs->sock->sk)->inet_sport == port && - inet_sk(gs->sock->sk)->sk.sk_family == family) - return gs; - } - - return NULL; -} - -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) -{ - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; - - memcpy(geneveh->options, options, options_len); -} - -/* Transmit a fully formatted Geneve frame. - * - * When calling this function. The skb->data should point - * to the geneve header which is fully formed. - * - * This function will add other UDP tunnel headers. - */ -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) - return -ENOMEM; - - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, - tos, ttl, df, src_port, dst_port, xnet, - !csum); -} -EXPORT_SYMBOL_GPL(geneve_xmit_skb); - -static int geneve_hlen(struct genevehdr *gh) -{ - return sizeof(*gh) + gh->opt_len * 4; -} - -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -{ - struct sk_buff *p, **pp = NULL; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_gnv = skb_gro_offset(skb); - hlen = off_gnv + sizeof(*gh); - gh = skb_gro_header_fast(skb, off_gnv); - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - if (gh->ver != GENEVE_VER || gh->oam) - goto out; - gh_len = geneve_hlen(gh); - - hlen = off_gnv + gh_len; - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - flush = 0; - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, gh_len); - skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -static int geneve_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -{ - struct genevehdr *gh; - struct packet_offload *ptype; - __be16 type; - int gh_len; - int err = -ENOSYS; - - udp_tunnel_gro_complete(skb, nhoff); - - gh = (struct genevehdr *)(skb->data + nhoff); - gh_len = geneve_hlen(gh); - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); - - rcu_read_unlock(); - return err; -} - -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(&gs->udp_offloads); - if (err) - pr_warn("geneve: udp_add_offload failed with status %d\n", - err); - } -} - -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct genevehdr *geneveh; - struct geneve_sock *gs; - int opts_len; - - /* Need Geneve and inner Ethernet header to be present */ - if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) - goto error; - - /* Return packets with reserved bits set */ - geneveh = geneve_hdr(skb); - - if (unlikely(geneveh->ver != GENEVE_VER)) - goto error; - - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) - goto error; - - opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB))) - goto drop; - - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - - gs->rcv(gs, skb); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; - -error: - /* Let the UDP layer deal with the skb */ - return 1; -} - -static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -/* Create new listen socket if needed */ -static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool ipv6) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct socket *sock; - struct udp_tunnel_sock_cfg tunnel_cfg; - - gs = kzalloc(sizeof(*gs), GFP_KERNEL); - if (!gs) - return ERR_PTR(-ENOMEM); - - sock = geneve_create_sock(net, ipv6, port); - if (IS_ERR(sock)) { - kfree(gs); - return ERR_CAST(sock); - } - - gs->sock = sock; - gs->refcnt = 1; - gs->rcv = rcv; - gs->rcv_data = data; - - /* Initialize the geneve udp offloads structure */ - gs->udp_offloads.port = port; - gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; - gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; - geneve_notify_add_rx_port(gs); - - /* Mark socket as an encapsulation socket */ - tunnel_cfg.sk_user_data = gs; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = geneve_udp_encap_recv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - - list_add(&gs->list, &gn->sock_list); - - return gs; -} - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) -{ - struct geneve_sock *gs; - - mutex_lock(&geneve_mutex); - - gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); - if (gs) { - if (!no_share && gs->rcv == rcv) - gs->refcnt++; - else - gs = ERR_PTR(-EBUSY); - } else { - gs = geneve_socket_create(net, port, rcv, data, ipv6); - } - - mutex_unlock(&geneve_mutex); - - return gs; -} -EXPORT_SYMBOL_GPL(geneve_sock_add); - -void geneve_sock_release(struct geneve_sock *gs) -{ - mutex_lock(&geneve_mutex); - - if (--gs->refcnt) - goto unlock; - - list_del(&gs->list); - geneve_notify_del_rx_port(gs); - udp_tunnel_sock_release(gs->sock); - kfree_rcu(gs, rcu); - -unlock: - mutex_unlock(&geneve_mutex); -} -EXPORT_SYMBOL_GPL(geneve_sock_release); - -static __net_init int geneve_init_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - - INIT_LIST_HEAD(&gn->sock_list); - - return 0; -} - -static struct pernet_operations geneve_net_ops = { - .init = geneve_init_net, - .id = &geneve_net_id, - .size = sizeof(struct geneve_net), -}; - -static int __init geneve_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&geneve_net_ops); - if (rc) - return rc; - - pr_info("Geneve driver\n"); - - return 0; -} -module_init(geneve_init_module); - -static void __exit geneve_cleanup_module(void) -{ - unregister_pernet_subsys(&geneve_net_ops); -} -module_exit(geneve_cleanup_module); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jesse Gross "); -MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c new file mode 100644 index 000000000000..001843d41135 --- /dev/null +++ b/net/ipv4/geneve_core.c @@ -0,0 +1,447 @@ +/* + * Geneve: Generic Network Virtualization Encapsulation + * + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#include +#include +#endif + +/* Protects sock_list and refcounts. */ +static DEFINE_MUTEX(geneve_mutex); + +/* per-network namespace private data for this module */ +struct geneve_net { + struct list_head sock_list; +}; + +static int geneve_net_id; + +static struct geneve_sock *geneve_find_sock(struct net *net, + sa_family_t family, __be16 port) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + + list_for_each_entry(gs, &gn->sock_list, list) { + if (inet_sk(gs->sock->sk)->inet_sport == port && + inet_sk(gs->sock->sk)->sk.sk_family == family) + return gs; + } + + return NULL; +} + +static void geneve_build_header(struct genevehdr *geneveh, + __be16 tun_flags, u8 vni[3], + u8 options_len, u8 *options) +{ + geneveh->ver = GENEVE_VER; + geneveh->opt_len = options_len / 4; + geneveh->oam = !!(tun_flags & TUNNEL_OAM); + geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + geneveh->rsvd1 = 0; + memcpy(geneveh->vni, vni, 3); + geneveh->proto_type = htons(ETH_P_TEB); + geneveh->rsvd2 = 0; + + memcpy(geneveh->options, options, options_len); +} + +/* Transmit a fully formatted Geneve frame. + * + * When calling this function. The skb->data should point + * to the geneve header which is fully formed. + * + * This function will add other UDP tunnel headers. + */ +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool csum, bool xnet) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) + return -ENOMEM; + + skb = udp_tunnel_handle_offloads(skb, csum); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + + return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, + tos, ttl, df, src_port, dst_port, xnet, + !csum); +} +EXPORT_SYMBOL_GPL(geneve_xmit_skb); + +static int geneve_hlen(struct genevehdr *gh) +{ + return sizeof(*gh) + gh->opt_len * 4; +} + +static struct sk_buff **geneve_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) +{ + struct sk_buff *p, **pp = NULL; + struct genevehdr *gh, *gh2; + unsigned int hlen, gh_len, off_gnv; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_gnv = skb_gro_offset(skb); + hlen = off_gnv + sizeof(*gh); + gh = skb_gro_header_fast(skb, off_gnv); + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + if (gh->ver != GENEVE_VER || gh->oam) + goto out; + gh_len = geneve_hlen(gh); + + hlen = off_gnv + gh_len; + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + gh2 = (struct genevehdr *)(p->data + off_gnv); + if (gh->opt_len != gh2->opt_len || + memcmp(gh, gh2, gh_len)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) { + flush = 1; + goto out_unlock; + } + + skb_gro_pull(skb, gh_len); + skb_gro_postpull_rcsum(skb, gh, gh_len); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int geneve_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) +{ + struct genevehdr *gh; + struct packet_offload *ptype; + __be16 type; + int gh_len; + int err = -ENOSYS; + + udp_tunnel_gro_complete(skb, nhoff); + + gh = (struct genevehdr *)(skb->data + nhoff); + gh_len = geneve_hlen(gh); + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); + + rcu_read_unlock(); + return err; +} + +static void geneve_notify_add_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&gs->udp_offloads); + if (err) + pr_warn("geneve: udp_add_offload failed with status %d\n", + err); + } +} + +static void geneve_notify_del_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + + if (sa_family == AF_INET) + udp_del_offload(&gs->udp_offloads); +} + +/* Callback from net/ipv4/udp.c to receive packets */ +static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct genevehdr *geneveh; + struct geneve_sock *gs; + int opts_len; + + /* Need Geneve and inner Ethernet header to be present */ + if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) + goto error; + + /* Return packets with reserved bits set */ + geneveh = geneve_hdr(skb); + + if (unlikely(geneveh->ver != GENEVE_VER)) + goto error; + + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + goto error; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, + htons(ETH_P_TEB))) + goto drop; + + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + goto drop; + + gs->rcv(gs, skb); + return 0; + +drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; + +error: + /* Let the UDP layer deal with the skb */ + return 1; +} + +static struct socket *geneve_create_sock(struct net *net, bool ipv6, + __be16 port) +{ + struct socket *sock; + struct udp_port_cfg udp_conf; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); + + if (ipv6) { + udp_conf.family = AF_INET6; + } else { + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + } + + udp_conf.local_udp_port = port; + + /* Open UDP socket */ + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + return ERR_PTR(err); + + return sock; +} + +/* Create new listen socket if needed */ +static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool ipv6) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + struct socket *sock; + struct udp_tunnel_sock_cfg tunnel_cfg; + + gs = kzalloc(sizeof(*gs), GFP_KERNEL); + if (!gs) + return ERR_PTR(-ENOMEM); + + sock = geneve_create_sock(net, ipv6, port); + if (IS_ERR(sock)) { + kfree(gs); + return ERR_CAST(sock); + } + + gs->sock = sock; + gs->refcnt = 1; + gs->rcv = rcv; + gs->rcv_data = data; + + /* Initialize the geneve udp offloads structure */ + gs->udp_offloads.port = port; + gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; + gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; + geneve_notify_add_rx_port(gs); + + /* Mark socket as an encapsulation socket */ + tunnel_cfg.sk_user_data = gs; + tunnel_cfg.encap_type = 1; + tunnel_cfg.encap_rcv = geneve_udp_encap_recv; + tunnel_cfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); + + list_add(&gs->list, &gn->sock_list); + + return gs; +} + +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6) +{ + struct geneve_sock *gs; + + mutex_lock(&geneve_mutex); + + gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); + if (gs) { + if (!no_share && gs->rcv == rcv) + gs->refcnt++; + else + gs = ERR_PTR(-EBUSY); + } else { + gs = geneve_socket_create(net, port, rcv, data, ipv6); + } + + mutex_unlock(&geneve_mutex); + + return gs; +} +EXPORT_SYMBOL_GPL(geneve_sock_add); + +void geneve_sock_release(struct geneve_sock *gs) +{ + mutex_lock(&geneve_mutex); + + if (--gs->refcnt) + goto unlock; + + list_del(&gs->list); + geneve_notify_del_rx_port(gs); + udp_tunnel_sock_release(gs->sock); + kfree_rcu(gs, rcu); + +unlock: + mutex_unlock(&geneve_mutex); +} +EXPORT_SYMBOL_GPL(geneve_sock_release); + +static __net_init int geneve_init_net(struct net *net) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + + INIT_LIST_HEAD(&gn->sock_list); + + return 0; +} + +static struct pernet_operations geneve_net_ops = { + .init = geneve_init_net, + .id = &geneve_net_id, + .size = sizeof(struct geneve_net), +}; + +static int __init geneve_init_module(void) +{ + int rc; + + rc = register_pernet_subsys(&geneve_net_ops); + if (rc) + return rc; + + pr_info("Geneve driver\n"); + + return 0; +} +module_init(geneve_init_module); + +static void __exit geneve_cleanup_module(void) +{ + unregister_pernet_subsys(&geneve_net_ops); +} +module_exit(geneve_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jesse Gross "); +MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ed6b0f8dd1bb..15840401a2ce 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -59,7 +59,7 @@ config OPENVSWITCH_VXLAN config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH - depends on GENEVE + depends on GENEVE_CORE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create geneve vport. -- cgit From d37d29c30509854d15401277f39aab0336525f27 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:29 -0400 Subject: geneve_core: identify as driver library in modules description Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/ipv4/geneve_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c index 001843d41135..311a4ba6950a 100644 --- a/net/ipv4/geneve_core.c +++ b/net/ipv4/geneve_core.c @@ -430,7 +430,7 @@ static int __init geneve_init_module(void) if (rc) return rc; - pr_info("Geneve driver\n"); + pr_info("Geneve core logic\n"); return 0; } @@ -444,4 +444,4 @@ module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jesse Gross "); -MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); +MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic"); -- cgit From a080e7bd0a8e56519d451eaea4ab05212d90e010 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 13 May 2015 13:34:13 -0700 Subject: net: Reserve skb headroom and set skb->dev even if using __alloc_skb When I had inlined __alloc_rx_skb into __netdev_alloc_skb and __napi_alloc_skb I had overlooked the fact that there was a return in the __alloc_rx_skb. As a result we weren't reserving headroom or setting the skb->dev in certain cases. This change corrects that by adding a couple of jump labels to jump to depending on __alloc_skb either succeeding or failing. Fixes: 9451980a6646 ("net: Use cached copy of pfmemalloc to avoid accessing page") Reported-by: Felipe Balbi Signed-off-by: Alexander Duyck Tested-by: Kevin Hilman Signed-off-by: David S. Miller --- net/core/skbuff.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d67e612bf0ef..f3fe9bd9e672 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -414,8 +414,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) - return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -445,9 +449,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, skb->pfmemalloc = 1; skb->head_frag = 1; +skb_success: skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; +skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); @@ -475,8 +481,12 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, len += NET_SKB_PAD + NET_IP_ALIGN; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) - return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -499,9 +509,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, skb->pfmemalloc = 1; skb->head_frag = 1; +skb_success: skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); skb->dev = napi->dev; +skb_fail: return skb; } EXPORT_SYMBOL(__napi_alloc_skb); -- cgit From f7191483461ce2ae579b6f7227fa7ce49e006656 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:35 +0200 Subject: netfilter: add hook list to nf_hook_state Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index e6163017c42d..e418cfd603c0 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -166,11 +166,9 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = list_entry_rcu(&nf_hooks[state->pf][state->hook], - struct nf_hook_ops, list); + elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); next_hook: - verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state, - &elem); + verdict = nf_iterate(state->hook_list, skb, state, &elem); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { -- cgit From 1cf51900f8545b358b5deaacfda348d990f671db Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:37 +0200 Subject: net: add CONFIG_NET_INGRESS to enable ingress filtering This new config switch enables the ingress filtering infrastructure that is controlled through the ingress_needed static key. This prepares the introduction of the Netfilter ingress hook that resides under this unique static key. Note that CONFIG_SCH_INGRESS automatically selects this, that should be no problem since this also depends on CONFIG_NET_CLS_ACT. Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/Kconfig | 3 +++ net/core/dev.c | 7 ++++--- net/sched/Kconfig | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index 44dd5786ee91..57a7c5af3175 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES Newly written code should NEVER need this option but do compat-independent messages instead! +config NET_INGRESS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/dev.c b/net/core/dev.c index af549062ae8e..a5ef90016ce7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1630,7 +1630,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS static struct static_key ingress_needed __read_mostly; void net_inc_ingress_queue(void) @@ -3798,13 +3798,14 @@ another_round: } skip_taps: -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto unlock; } - +#endif +#ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; ncls: #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 5fd1c2f487d2..daa33432b716 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -312,6 +312,7 @@ config NET_SCH_PIE config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT + select NET_INGRESS ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. -- cgit From e687ad60af09010936bbd0b2a3b5d90a8ee8353c Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:38 +0200 Subject: netfilter: add netfilter ingress hook after handle_ing() under unique static key This patch adds the Netfilter ingress hook just after the existing tc ingress hook, that seems to be the consensus solution for this. Note that the Netfilter hook resides under the global static key that enables ingress filtering. Nonetheless, Netfilter still also has its own static key for minimal impact on the existing handle_ing(). * Without this patch: Result: OK: 6216490(c6216338+d152) usec, 100000000 (60byte,0frags) 16086246pps 7721Mb/sec (7721398080bps) errors: 100000000 42.46% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 25.92% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.81% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.62% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.70% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.34% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.44% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch: Result: OK: 6214833(c6214731+d101) usec, 100000000 (60byte,0frags) 16090536pps 7723Mb/sec (7723457280bps) errors: 100000000 41.23% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 26.57% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.72% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.55% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.78% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.06% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.43% kpktgend_0 [kernel.kallsyms] [k] __build_skb * Without this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9269001(c9268821+d179) usec, 100000000 (60byte,0frags) 10788648pps 5178Mb/sec (5178551040bps) errors: 100000000 40.99% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.50% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.77% kpktgend_0 [cls_u32] [k] u32_classify 5.62% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.18% kpktgend_0 [pktgen] [k] pktgen_thread_worker 3.23% kpktgend_0 [kernel.kallsyms] [k] tc_classify 2.97% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 1.83% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.50% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 0.99% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9308218(c9308091+d126) usec, 100000000 (60byte,0frags) 10743194pps 5156Mb/sec (5156733120bps) errors: 100000000 42.01% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.78% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.70% kpktgend_0 [cls_u32] [k] u32_classify 5.46% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.16% kpktgend_0 [pktgen] [k] pktgen_thread_worker 2.98% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.84% kpktgend_0 [kernel.kallsyms] [k] tc_classify 1.96% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.57% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk Note that the results are very similar before and after. I can see gcc gets the code under the ingress static key out of the hot path. Then, on that cold branch, it generates the code to accomodate the netfilter ingress static key. My explanation for this is that this reduces the pressure on the instruction cache for non-users as the new code is out of the hot path, and it comes with minimal impact for tc ingress users. Using gcc version 4.8.4 on: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 8 [...] L1d cache: 16K L1i cache: 64K L2 cache: 2048K L3 cache: 8192K Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 36 ++++++++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 7 +++++++ net/netfilter/core.c | 31 ++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a5ef90016ce7..29f0d6e6542c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,6 +135,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -3666,6 +3667,13 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, return skb; } +#else +static inline struct sk_buff *handle_ing(struct sk_buff *skb, + struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + return skb; +} #endif /** @@ -3739,6 +3747,28 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) } } +#ifdef CONFIG_NETFILTER_INGRESS +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + if (nf_hook_ingress_active(skb)) { + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + return nf_hook_ingress(skb); + } + return 0; +} +#else +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + return 0; +} +#endif + static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; @@ -3803,6 +3833,9 @@ skip_taps: skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto unlock; + + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto unlock; } #endif #ifdef CONFIG_NET_CLS_ACT @@ -6968,6 +7001,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_ingress_init(dev); + return dev; free_all: diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f70e34a68f70..db1c674397ad 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,6 +1,13 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER +config NETFILTER_INGRESS + bool "Netfilter ingress support" + select NET_INGRESS + help + This allows you to classify packets from ingress using the Netfilter + infrastructure. + config NETFILTER_NETLINK tristate diff --git a/net/netfilter/core.c b/net/netfilter/core.c index e418cfd603c0..653e32eac08c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -64,10 +64,27 @@ static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { + struct list_head *nf_hook_list; struct nf_hook_ops *elem; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + BUG_ON(reg->dev == NULL); + nf_hook_list = ®->dev->nf_hooks_ingress; + net_inc_ingress_queue(); + break; + } +#endif + /* Fall through. */ + default: + nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + break; + } + + list_for_each_entry(elem, nf_hook_list, list) { if (reg->priority < elem->priority) break; } @@ -85,6 +102,18 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + net_dec_ingress_queue(); + break; + } + break; +#endif + default: + break; + } #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif -- cgit From 7fb48c5bc3100f7674a8e26f42c1518196500728 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 3 May 2015 22:05:28 +0200 Subject: netfilter: bridge: neigh_head and physoutdev can't be used at same time The neigh_header is only needed when we detect DNAT after prerouting and neigh cache didn't have a mac address for us. The output port has not been chosen yet so we can re-use the storage area, bringing struct size down to 32 bytes on x86_64. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index ab55e2472beb..13973da29b2a 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -973,6 +973,8 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) nf_bridge->neigh_header, ETH_HLEN - ETH_ALEN); skb->dev = nf_bridge->physindev; + + nf_bridge->physoutdev = NULL; br_handle_frame_finish(NULL, skb); } -- cgit From a9fcc6a41de947108898bc9807f25fa326467365 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 3 May 2015 22:06:07 +0200 Subject: netfilter: bridge: free nf_bridge info on xmit nf_bridge information is only needed for -m physdev, so we can always free it after POST_ROUTING. This has the advantage that allocation and free will typically happen on the same cpu. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 13973da29b2a..2b0e8bb49944 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -129,6 +129,14 @@ static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb) return skb->nf_bridge; } +static void nf_bridge_info_free(struct sk_buff *skb) +{ + if (skb->nf_bridge) { + nf_bridge_put(skb->nf_bridge); + skb->nf_bridge = NULL; + } +} + static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) { struct net_bridge_port *port; @@ -841,6 +849,7 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); + nf_bridge_info_free(skb); return br_dev_queue_push_xmit(sk, skb); } @@ -850,8 +859,10 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) int frag_max_size; unsigned int mtu_reserved; - if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) + if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) { + nf_bridge_info_free(skb); return br_dev_queue_push_xmit(sk, skb); + } mtu_reserved = nf_bridge_mtu_reduction(skb); /* This is wrong! We should preserve the original fragment @@ -877,6 +888,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) ret = ip_fragment(sk, skb, br_nf_push_frag_xmit); } else { + nf_bridge_info_free(skb); ret = br_dev_queue_push_xmit(sk, skb); } @@ -885,7 +897,8 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) #else static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) { - return br_dev_queue_push_xmit(sk, skb); + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(sk, skb); } #endif -- cgit From a3b1c1eb50f9b3e0c73c37157d0c61b2e90ae580 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 6 May 2015 16:28:57 +0200 Subject: netfilter: ipset: deinline ip_set_put_extensions() On x86 allyesconfig build: The function compiles to 489 bytes of machine code. It has 25 callsites. text data bss dec hex filename 82441375 22255384 20627456 125324215 7784bb7 vmlinux.before 82434909 22255384 20627456 125317749 7783275 vmlinux Signed-off-by: Denys Vlasenko CC: Jozsef Kadlecsik CC: Eric W. Biederman CC: David S. Miller CC: Jan Engelhardt CC: Jiri Pirko CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: netfilter-devel@vger.kernel.org Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 7f9c0565e715..475e4960a164 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -432,6 +432,31 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); +int +ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, + const void *e, bool active) +{ + if (SET_WITH_TIMEOUT(set)) { + unsigned long *timeout = ext_timeout(e, set); + + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(active ? ip_set_timeout_get(timeout) + : *timeout))) + return -EMSGSIZE; + } + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, set))) + return -EMSGSIZE; + if (SET_WITH_COMMENT(set) && + ip_set_put_comment(skb, ext_comment(e, set))) + return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_put_extensions); + /* * Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace -- cgit From 12b7ed29bdea2d6dde58f304321d4e8ca453f13c Mon Sep 17 00:00:00 2001 From: Zhang Chunyu Date: Fri, 17 Apr 2015 10:21:32 +0800 Subject: netfilter: xt_MARK: Add ARP support Add arpt_MARK to xt_mark. The corresponding userspace update is available at: http://git.netfilter.org/arptables/commit/?id=4bb2f8340783fd3a3f70aa6f8807428a280f8474 Signed-off-by: Zhang Chunyu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_mark.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 23345238711b..ebd41dc501e5 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -23,6 +23,7 @@ MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); +MODULE_ALIAS("arpt_MARK"); static unsigned int mark_tg(struct sk_buff *skb, const struct xt_action_param *par) -- cgit From b1c29f6b10d5981c89d3ea9b9991ca97141ed6d0 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:11 -0400 Subject: tipc: simplify resetting and disabling of bearers Since commit 4b475e3f2f8e4e241de101c8240f1d74d0470494 ("tipc: eliminate delayed link deletion at link failover") the extra boolean parameter "shutting_down" is not any longer needed for the functions bearer_disable() and tipc_link_delete_list(). Furhermore, the function tipc_link_reset_links(), called from bearer_reset() is now unnecessary. We can just as well delete all the links, as we do in bearer_disable(), and start over with creating new links. This commit introduces those changes. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 18 ++++++++---------- net/tipc/link.c | 20 +------------------- net/tipc/link.h | 4 +--- 3 files changed, 10 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 99c0bd43feed..00bc0e620532 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -71,8 +71,7 @@ static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } }; -static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, - bool shutting_down); +static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr); /** * tipc_media_find - locates specified media object by name @@ -324,7 +323,7 @@ restart: res = tipc_disc_create(net, b_ptr, &b_ptr->bcast_addr); if (res) { - bearer_disable(net, b_ptr, false); + bearer_disable(net, b_ptr); pr_warn("Bearer <%s> rejected, discovery object creation failed\n", name); return -EINVAL; @@ -344,7 +343,7 @@ restart: static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) { pr_info("Resetting bearer <%s>\n", b_ptr->name); - tipc_link_reset_list(net, b_ptr->identity); + tipc_link_delete_list(net, b_ptr->identity); tipc_disc_reset(net, b_ptr); return 0; } @@ -354,8 +353,7 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) * * Note: This routine assumes caller holds RTNL lock. */ -static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, - bool shutting_down) +static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr) { struct tipc_net *tn = net_generic(net, tipc_net_id); u32 i; @@ -363,7 +361,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, pr_info("Disabling bearer <%s>\n", b_ptr->name); b_ptr->media->disable_media(b_ptr); - tipc_link_delete_list(net, b_ptr->identity, shutting_down); + tipc_link_delete_list(net, b_ptr->identity); if (b_ptr->link_req) tipc_disc_delete(b_ptr->link_req); @@ -541,7 +539,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b_ptr, false); + bearer_disable(dev_net(dev), b_ptr); break; } return NOTIFY_OK; @@ -583,7 +581,7 @@ void tipc_bearer_stop(struct net *net) for (i = 0; i < MAX_BEARERS; i++) { b_ptr = rtnl_dereference(tn->bearer_list[i]); if (b_ptr) { - bearer_disable(net, b_ptr, true); + bearer_disable(net, b_ptr); tn->bearer_list[i] = NULL; } } @@ -747,7 +745,7 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - bearer_disable(net, bearer, false); + bearer_disable(net, bearer); rtnl_unlock(); return 0; diff --git a/net/tipc/link.c b/net/tipc/link.c index 374d52335168..266dbc6a34c1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -311,8 +311,7 @@ void tipc_link_delete(struct tipc_link *l) tipc_link_put(l); } -void tipc_link_delete_list(struct net *net, unsigned int bearer_id, - bool shutting_down) +void tipc_link_delete_list(struct net *net, unsigned int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *link; @@ -476,23 +475,6 @@ void tipc_link_reset(struct tipc_link *l_ptr) link_reset_statistics(l_ptr); } -void tipc_link_reset_list(struct net *net, unsigned int bearer_id) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *l_ptr; - struct tipc_node *n_ptr; - - rcu_read_lock(); - list_for_each_entry_rcu(n_ptr, &tn->node_list, list) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->links[bearer_id]; - if (l_ptr) - tipc_link_reset(l_ptr); - tipc_node_unlock(n_ptr); - } - rcu_read_unlock(); -} - static void link_activate(struct tipc_link *link) { struct tipc_node *node = link->owner; diff --git a/net/tipc/link.h b/net/tipc/link.h index b5b4e3554d4e..dc27bb62b1f5 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -213,8 +213,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, struct tipc_bearer *b_ptr, const struct tipc_media_addr *media_addr); void tipc_link_delete(struct tipc_link *link); -void tipc_link_delete_list(struct net *net, unsigned int bearer_id, - bool shutting_down); +void tipc_link_delete_list(struct net *net, unsigned int bearer_id); void tipc_link_failover_send_queue(struct tipc_link *l_ptr); void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest); void tipc_link_reset_fragments(struct tipc_link *l_ptr); @@ -223,7 +222,6 @@ int tipc_link_is_active(struct tipc_link *l_ptr); void tipc_link_purge_queues(struct tipc_link *l_ptr); void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); -void tipc_link_reset_list(struct net *net, unsigned int bearer_id); int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest, -- cgit From 75b44b018eb086fb461aa0351d2ecb1eba684302 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:12 -0400 Subject: tipc: simplify link timer handling Prior to this commit, the link timer has been running at a "continuity interval" of configured link tolerance/4. When a timer wakes up and discovers that there has been no sign of life from the peer during the previous interval, it divides its own timer interval by another factor four, and starts sending one probe per new interval. When the configured link tolerance time has passed without answer, i.e. after 16 unacked probes, the link is declared faulty and reset. This is unnecessary complex. It is sufficient to continue with the original continuity interval, and instead reset the link after four missed probe responses. This makes the timer handling in the link simpler, and opens up for some planned later changes in this area. This commit implements this change. Reviewed-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 266dbc6a34c1..d71e83d0959b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -528,7 +528,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv / 4); + link_set_timer(l_ptr, cont_intv); break; case RESET_MSG: pr_debug("%s<%s>, requested by peer\n", @@ -579,7 +579,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv / 4); + link_set_timer(l_ptr, cont_intv); } else { /* Link has failed */ pr_debug("%s<%s>, peer not responding\n", link_rst_msg, l_ptr->name); @@ -1725,7 +1725,7 @@ static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) l_ptr->tolerance = tol; l_ptr->cont_intv = msecs_to_jiffies(intv); - l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv) / 4); + l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv)); } void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) -- cgit From a6bf70f792963b32e410e5c3d2f96903265b090a Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:13 -0400 Subject: tipc: simplify include dependencies When we try to add new inline functions in the code, we sometimes run into circular include dependencies. The main problem is that the file core.h, which really should be at the root of the dependency chain, instead is a leaf. I.e., core.h includes a number of header files that themselves should be allowed to include core.h. In reality this is unnecessary, because core.h does not need to know the full signature of any of the structs it refers to, only their type declaration. In this commit, we remove all dependencies from core.h towards any other tipc header file. As a consequence of this change, we can now move the function tipc_own_addr(net) from addr.c to addr.h, and make it inline. There are no functional changes in this commit. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/addr.c | 7 ------- net/tipc/addr.h | 8 ++++++++ net/tipc/bearer.h | 2 +- net/tipc/core.h | 17 ++++++++++------- net/tipc/net.c | 1 + net/tipc/node.c | 1 + net/tipc/node.h | 2 -- net/tipc/socket.c | 1 + 8 files changed, 22 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index ba7daa864d44..48fd3b5a73fb 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -38,13 +38,6 @@ #include "addr.h" #include "core.h" -u32 tipc_own_addr(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - return tn->own_addr; -} - /** * in_own_cluster - test for cluster inclusion; <0.0.0> always matches */ diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 7ba6d5c8ae40..93f7c983be33 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -41,10 +41,18 @@ #include #include #include +#include "core.h" #define TIPC_ZONE_MASK 0xff000000u #define TIPC_CLUSTER_MASK 0xfffff000u +static inline u32 tipc_own_addr(struct net *net) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + + return tn->own_addr; +} + static inline u32 tipc_zone_mask(u32 addr) { return addr & TIPC_ZONE_MASK; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 5cad243ee8fc..dc714d977768 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -38,9 +38,9 @@ #define _TIPC_BEARER_H #include "netlink.h" +#include "core.h" #include -#define MAX_BEARERS 2 #define MAX_MEDIA 3 #define MAX_NODES 4096 #define WSIZE 32 diff --git a/net/tipc/core.h b/net/tipc/core.h index 3dc68c7a966d..53e8146b14e0 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -60,16 +60,19 @@ #include #include -#include "node.h" -#include "bearer.h" -#include "bcast.h" -#include "netlink.h" -#include "link.h" -#include "node.h" -#include "msg.h" +struct tipc_node; +struct tipc_bearer; +struct tipc_bcbearer; +struct tipc_bclink; +struct tipc_link; +struct tipc_name_table; +struct tipc_server; #define TIPC_MOD_VER "2.0.0" +#define NODE_HTABLE_SIZE 512 +#define MAX_BEARERS 3 + extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; diff --git a/net/tipc/net.c b/net/tipc/net.c index a54f3cbe2246..d6d1399ae229 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -40,6 +40,7 @@ #include "subscr.h" #include "socket.h" #include "node.h" +#include "bcast.h" static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, diff --git a/net/tipc/node.c b/net/tipc/node.c index 22c059ad2999..eb3856bb8c5a 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -39,6 +39,7 @@ #include "node.h" #include "name_distr.h" #include "socket.h" +#include "bcast.h" static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); diff --git a/net/tipc/node.h b/net/tipc/node.h index 02d5c20dc551..5a834cf142c8 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,8 +45,6 @@ /* Out-of-range value for node signature */ #define INVALID_NODE_SIG 0x10000 -#define NODE_HTABLE_SIZE 512 - /* Flags used to take different actions according to flag type * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8f3c8e2cef8e..9370f953e16f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -41,6 +41,7 @@ #include "link.h" #include "name_distr.h" #include "socket.h" +#include "bcast.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ -- cgit From e4bf4f76962b0869d1048ac6c52a46e7d90eb46f Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:14 -0400 Subject: tipc: simplify packet sequence number handling Although the sequence number in the TIPC protocol is 16 bits, we have until now stored it internally as an unsigned 32 bits integer. We got around this by always doing explicit modulo-65535 operations whenever we need to access a sequence number. We now make the incoming and outgoing sequence numbers to unsigned 16-bit integers, and remove the modulo operations where applicable. We also move the arithmetic inline functions for 16 bit integers to core.h, and the function buf_seqno() to msg.h, so they can easily be accessed from anywhere in the code. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/core.h | 20 ++++++++++++++++++++ net/tipc/link.c | 37 ++++++++++++++++++------------------- net/tipc/link.h | 41 ++++------------------------------------- net/tipc/msg.h | 17 +++++++++++------ 4 files changed, 53 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/tipc/core.h b/net/tipc/core.h index 53e8146b14e0..0fcf133d5cb7 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -109,6 +109,26 @@ struct tipc_net { atomic_t subscription_count; }; +static inline u16 mod(u16 x) +{ + return x & 0xffffu; +} + +static inline int less_eq(u16 left, u16 right) +{ + return mod(right - left) < 32768u; +} + +static inline int more(u16 left, u16 right) +{ + return !less_eq(left, right); +} + +static inline int less(u16 left, u16 right) +{ + return less_eq(left, right) && (mod(right) != mod(left)); +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/link.c b/net/tipc/link.c index d71e83d0959b..391a96f0efc7 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -685,9 +685,9 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, unsigned int maxwin = link->window; unsigned int imp = msg_importance(msg); uint mtu = link->mtu; - uint ack = mod(link->next_in_no - 1); - uint seqno = link->next_out_no; - uint bc_last_in = link->owner->bclink.last_in; + u16 ack = mod(link->next_in_no - 1); + u16 seqno = link->next_out_no; + u16 bc_last_in = link->owner->bclink.last_in; struct tipc_media_addr *addr = &link->media_addr; struct sk_buff_head *transmq = &link->transmq; struct sk_buff_head *backlogq = &link->backlogq; @@ -859,7 +859,7 @@ void tipc_link_push_packets(struct tipc_link *link) { struct sk_buff *skb; struct tipc_msg *msg; - unsigned int ack = mod(link->next_in_no - 1); + u16 ack = mod(link->next_in_no - 1); while (skb_queue_len(&link->transmq) < link->window) { skb = __skb_dequeue(&link->backlogq); @@ -998,13 +998,13 @@ synched: static void link_retrieve_defq(struct tipc_link *link, struct sk_buff_head *list) { - u32 seq_no; + u16 seq_no; if (skb_queue_empty(&link->deferdq)) return; seq_no = buf_seqno(skb_peek(&link->deferdq)); - if (seq_no == mod(link->next_in_no)) + if (seq_no == link->next_in_no) skb_queue_splice_tail_init(&link->deferdq, list); } @@ -1025,8 +1025,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) struct tipc_link *l_ptr; struct sk_buff *skb1, *tmp; struct tipc_msg *msg; - u32 seq_no; - u32 ackd; + u16 seq_no; + u16 ackd; u32 released; skb2list(skb, &head); @@ -1119,7 +1119,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) } /* Link is now in state WORKING_WORKING */ - if (unlikely(seq_no != mod(l_ptr->next_in_no))) { + if (unlikely(seq_no != l_ptr->next_in_no)) { link_handle_out_of_seq_msg(l_ptr, skb); link_retrieve_defq(l_ptr, &head); skb = NULL; @@ -1250,7 +1250,7 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb) { struct sk_buff *skb1; - u32 seq_no = buf_seqno(skb); + u16 seq_no = buf_seqno(skb); /* Empty queue ? */ if (skb_queue_empty(list)) { @@ -1266,7 +1266,7 @@ u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb) /* Locate insertion point in queue, then insert; discard if duplicate */ skb_queue_walk(list, skb1) { - u32 curr_seqno = buf_seqno(skb1); + u16 curr_seqno = buf_seqno(skb1); if (seq_no == curr_seqno) { kfree_skb(skb); @@ -1301,7 +1301,7 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, * Discard packet if a duplicate; otherwise add it to deferred queue * and notify peer of gap as per protocol specification */ - if (less(seq_no, mod(l_ptr->next_in_no))) { + if (less(seq_no, l_ptr->next_in_no)) { l_ptr->stats.duplicates++; kfree_skb(buf); return; @@ -1326,6 +1326,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, struct tipc_msg *msg = l_ptr->pmsg; u32 msg_size = sizeof(l_ptr->proto_msg); int r_flag; + u16 last_rcv; /* Don't send protocol message during link failover */ if (l_ptr->flags & LINK_FAILINGOVER) @@ -1342,7 +1343,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net)); if (msg_typ == STATE_MSG) { - u32 next_sent = mod(l_ptr->next_out_no); + u16 next_sent = l_ptr->next_out_no; if (!tipc_link_is_up(l_ptr)) return; @@ -1350,8 +1351,8 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, next_sent = buf_seqno(skb_peek(&l_ptr->backlogq)); msg_set_next_sent(msg, next_sent); if (!skb_queue_empty(&l_ptr->deferdq)) { - u32 rec = buf_seqno(skb_peek(&l_ptr->deferdq)); - gap = mod(rec - mod(l_ptr->next_in_no)); + last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq)); + gap = mod(last_rcv - l_ptr->next_in_no); } msg_set_seq_gap(msg, gap); if (gap) @@ -1485,10 +1486,8 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, if (link_reset_unknown(l_ptr)) break; - if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) { - rec_gap = mod(msg_next_sent(msg) - - mod(l_ptr->next_in_no)); - } + if (less_eq(l_ptr->next_in_no, msg_next_sent(msg))) + rec_gap = mod(msg_next_sent(msg) - l_ptr->next_in_no); if (msg_probe(msg)) l_ptr->stats.recv_probes++; diff --git a/net/tipc/link.h b/net/tipc/link.h index dc27bb62b1f5..a65770bf647c 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -151,7 +151,7 @@ struct tipc_link { /* Management and link supervision data */ unsigned int flags; - u32 checkpoint; + u16 checkpoint; u32 peer_session; u32 peer_bearer_id; u32 bearer_id; @@ -185,13 +185,13 @@ struct tipc_link { u16 len; u16 limit; } backlog[5]; - u32 next_out_no; + u16 next_out_no; + u16 last_retransmitted; u32 window; - u32 last_retransmitted; u32 stale_count; /* Reception */ - u32 next_in_no; + u16 next_in_no; u32 rcv_unacked; struct sk_buff_head deferdq; struct sk_buff_head inputq; @@ -245,39 +245,6 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); void link_prepare_wakeup(struct tipc_link *l); -/* - * Link sequence number manipulation routines (uses modulo 2**16 arithmetic) - */ -static inline u32 buf_seqno(struct sk_buff *buf) -{ - return msg_seqno(buf_msg(buf)); -} - -static inline u32 mod(u32 x) -{ - return x & 0xffffu; -} - -static inline int less_eq(u32 left, u32 right) -{ - return mod(right - left) < 32768u; -} - -static inline int more(u32 left, u32 right) -{ - return !less_eq(left, right); -} - -static inline int less(u32 left, u32 right) -{ - return less_eq(left, right) && (mod(right) != mod(left)); -} - -static inline u32 lesser(u32 left, u32 right) -{ - return less_eq(left, right) ? left : right; -} - static inline u32 link_own_addr(struct tipc_link *l) { return msg_prevnode(l->pmsg); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index e1d3595e2ee9..6ca2366f3a53 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -313,12 +313,12 @@ static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 19, 0x3, n); } -static inline u32 msg_bcast_ack(struct tipc_msg *m) +static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } -static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) +static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } @@ -327,22 +327,22 @@ static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) /* * Word 2 */ -static inline u32 msg_ack(struct tipc_msg *m) +static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } -static inline void msg_set_ack(struct tipc_msg *m, u32 n) +static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } -static inline u32 msg_seqno(struct tipc_msg *m) +static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } -static inline void msg_set_seqno(struct tipc_msg *m, u32 n) +static inline void msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } @@ -782,6 +782,11 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, u32 *dnode, int *err); struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list); +static inline u16 buf_seqno(struct sk_buff *skb) +{ + return msg_seqno(buf_msg(skb)); +} + /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any -- cgit From a97b9d3fa9bce0d78dc83a14a9e1ebb3bf5cc414 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:15 -0400 Subject: tipc: rename fields in struct tipc_link We rename some fields in struct tipc_link, in order to give them more descriptive names: next_in_no -> rcv_nxt next_out_no-> snd_nxt fsm_msg_cnt-> silent_intv_cnt cont_intv -> keepalive_intv last_retransmitted -> last_retransm There are no functional changes in this commit. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 18 +++---- net/tipc/link.c | 155 ++++++++++++++++++++++++++++--------------------------- net/tipc/link.h | 20 +++---- 3 files changed, 97 insertions(+), 96 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 3e18fc6c7877..842e19f6abf6 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -118,16 +118,16 @@ static void bclink_set_last_sent(struct net *net) struct sk_buff *skb = skb_peek(&bcl->backlogq); if (skb) - bcl->fsm_msg_cnt = mod(buf_seqno(skb) - 1); + bcl->silent_intv_cnt = mod(buf_seqno(skb) - 1); else - bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); + bcl->silent_intv_cnt = mod(bcl->snd_nxt - 1); } u32 tipc_bclink_get_last_sent(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - return tn->bcl->fsm_msg_cnt; + return tn->bcl->silent_intv_cnt; } static void bclink_update_last_sent(struct tipc_node *node, u32 seqno) @@ -212,16 +212,16 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) * or both sent and unsent messages (otherwise) */ if (tn->bclink->bcast_nodes.count) - acked = tn->bcl->fsm_msg_cnt; + acked = tn->bcl->silent_intv_cnt; else - acked = tn->bcl->next_out_no; + acked = tn->bcl->snd_nxt; } else { /* * Bail out if specified sequence number does not correspond * to a message that has been sent and not yet acknowledged */ if (less(acked, buf_seqno(skb)) || - less(tn->bcl->fsm_msg_cnt, acked) || + less(tn->bcl->silent_intv_cnt, acked) || less_eq(acked, n_ptr->bclink.acked)) goto exit; } @@ -803,9 +803,9 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->next_in_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->next_out_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt)) goto attr_msg_full; prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); @@ -914,7 +914,7 @@ int tipc_bclink_init(struct net *net) __skb_queue_head_init(&bcl->backlogq); __skb_queue_head_init(&bcl->deferdq); skb_queue_head_init(&bcl->wakeupq); - bcl->next_out_no = 1; + bcl->snd_nxt = 1; spin_lock_init(&bclink->node.lock); __skb_queue_head_init(&bclink->arrvq); skb_queue_head_init(&bclink->inputq); diff --git a/net/tipc/link.c b/net/tipc/link.c index 391a96f0efc7..c39543e8112d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -233,8 +233,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, if (n_ptr->link_cnt >= MAX_BEARERS) { tipc_addr_string_fill(addr_string, n_ptr->addr); - pr_err("Attempt to establish %uth link to %s. Max %u allowed.\n", - n_ptr->link_cnt, addr_string, MAX_BEARERS); + pr_err("Cannot establish %uth link to %s. Max %u allowed.\n", + n_ptr->link_cnt, addr_string, MAX_BEARERS); return NULL; } @@ -280,7 +280,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, l_ptr->mtu = l_ptr->advertised_mtu; l_ptr->priority = b_ptr->priority; tipc_link_set_queue_limits(l_ptr, b_ptr->window); - l_ptr->next_out_no = 1; + l_ptr->snd_nxt = 1; __skb_queue_head_init(&l_ptr->transmq); __skb_queue_head_init(&l_ptr->backlogq); __skb_queue_head_init(&l_ptr->deferdq); @@ -450,9 +450,9 @@ void tipc_link_reset(struct tipc_link *l_ptr) if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { l_ptr->flags |= LINK_FAILINGOVER; - l_ptr->failover_checkpt = l_ptr->next_in_no; + l_ptr->failover_checkpt = l_ptr->rcv_nxt; pl->failover_pkts = FIRST_FAILOVER; - pl->failover_checkpt = l_ptr->next_in_no; + pl->failover_checkpt = l_ptr->rcv_nxt; pl->failover_skb = l_ptr->reasm_buf; } else { kfree_skb(l_ptr->reasm_buf); @@ -469,8 +469,8 @@ void tipc_link_reset(struct tipc_link *l_ptr) l_ptr->reasm_buf = NULL; l_ptr->rcv_unacked = 0; l_ptr->checkpoint = 1; - l_ptr->next_out_no = 1; - l_ptr->fsm_msg_cnt = 0; + l_ptr->snd_nxt = 1; + l_ptr->silent_intv_cnt = 0; l_ptr->stale_count = 0; link_reset_statistics(l_ptr); } @@ -479,7 +479,7 @@ static void link_activate(struct tipc_link *link) { struct tipc_node *node = link->owner; - link->next_in_no = 1; + link->rcv_nxt = 1; link->stats.recv_info = 1; tipc_node_link_up(node, link); tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); @@ -493,7 +493,7 @@ static void link_activate(struct tipc_link *link) static void link_state_event(struct tipc_link *l_ptr, unsigned int event) { struct tipc_link *other; - unsigned long cont_intv = l_ptr->cont_intv; + unsigned long timer_intv = l_ptr->keepalive_intv; if (l_ptr->flags & LINK_STOPPED) return; @@ -503,7 +503,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (l_ptr->flags & LINK_FAILINGOVER) { if (event == TIMEOUT_EVT) - link_set_timer(l_ptr, cont_intv); + link_set_timer(l_ptr, timer_intv); return; } @@ -514,32 +514,32 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) case ACTIVATE_MSG: break; case TIMEOUT_EVT: - if (l_ptr->next_in_no != l_ptr->checkpoint) { - l_ptr->checkpoint = l_ptr->next_in_no; + if (l_ptr->rcv_nxt != l_ptr->checkpoint) { + l_ptr->checkpoint = l_ptr->rcv_nxt; if (tipc_bclink_acks_missing(l_ptr->owner)) { tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; + l_ptr->silent_intv_cnt++; } - link_set_timer(l_ptr, cont_intv); + link_set_timer(l_ptr, timer_intv); break; } l_ptr->state = WORKING_UNKNOWN; - l_ptr->fsm_msg_cnt = 0; + l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: pr_debug("%s<%s>, requested by peer\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; + l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); break; default: pr_debug("%s%u in WW state\n", link_unk_evt, event); @@ -550,46 +550,47 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) case TRAFFIC_MSG_EVT: case ACTIVATE_MSG: l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt = 0; + link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: pr_debug("%s<%s>, requested by peer while probing\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; + l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); break; case TIMEOUT_EVT: - if (l_ptr->next_in_no != l_ptr->checkpoint) { + if (l_ptr->rcv_nxt != l_ptr->checkpoint) { l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - l_ptr->checkpoint = l_ptr->next_in_no; + l_ptr->silent_intv_cnt = 0; + l_ptr->checkpoint = l_ptr->rcv_nxt; if (tipc_bclink_acks_missing(l_ptr->owner)) { tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; + l_ptr->silent_intv_cnt++; } - link_set_timer(l_ptr, cont_intv); - } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) { + link_set_timer(l_ptr, timer_intv); + } else if (l_ptr->silent_intv_cnt < + l_ptr->abort_limit) { tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); } else { /* Link has failed */ pr_debug("%s<%s>, peer not responding\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_UNKNOWN; - l_ptr->fsm_msg_cnt = 0; + l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); } break; default: @@ -605,31 +606,31 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; + l_ptr->silent_intv_cnt = 0; link_activate(l_ptr); tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; + l_ptr->silent_intv_cnt++; if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, cont_intv); + link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; + l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); break; case STARTING_EVT: l_ptr->flags |= LINK_STARTED; - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); break; case TIMEOUT_EVT: tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); break; default: pr_err("%s%u in RU state\n", link_unk_evt, event); @@ -643,21 +644,21 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; + l_ptr->silent_intv_cnt = 0; link_activate(l_ptr); tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; + l_ptr->silent_intv_cnt++; if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, cont_intv); + link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: break; case TIMEOUT_EVT: tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt++; + link_set_timer(l_ptr, timer_intv); break; default: pr_err("%s%u in RR state\n", link_unk_evt, event); @@ -685,8 +686,8 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, unsigned int maxwin = link->window; unsigned int imp = msg_importance(msg); uint mtu = link->mtu; - u16 ack = mod(link->next_in_no - 1); - u16 seqno = link->next_out_no; + u16 ack = mod(link->rcv_nxt - 1); + u16 seqno = link->snd_nxt; u16 bc_last_in = link->owner->bclink.last_in; struct tipc_media_addr *addr = &link->media_addr; struct sk_buff_head *transmq = &link->transmq; @@ -729,7 +730,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, link->backlog[imp].len++; seqno++; } - link->next_out_no = seqno; + link->snd_nxt = seqno; return 0; } @@ -859,7 +860,7 @@ void tipc_link_push_packets(struct tipc_link *link) { struct sk_buff *skb; struct tipc_msg *msg; - u16 ack = mod(link->next_in_no - 1); + u16 ack = mod(link->rcv_nxt - 1); while (skb_queue_len(&link->transmq) < link->window) { skb = __skb_dequeue(&link->backlogq); @@ -946,13 +947,13 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, msg = buf_msg(skb); /* Detect repeated retransmit failures */ - if (l_ptr->last_retransmitted == msg_seqno(msg)) { + if (l_ptr->last_retransm == msg_seqno(msg)) { if (++l_ptr->stale_count > 100) { link_retransmit_failure(l_ptr, skb); return; } } else { - l_ptr->last_retransmitted = msg_seqno(msg); + l_ptr->last_retransm = msg_seqno(msg); l_ptr->stale_count = 1; } @@ -960,7 +961,7 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, if (!retransmits) break; msg = buf_msg(skb); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1)); msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, skb, &l_ptr->media_addr); @@ -983,11 +984,11 @@ static bool link_synch(struct tipc_link *l) goto synched; /* Was last pre-synch packet added to input queue ? */ - if (less_eq(pl->next_in_no, l->synch_point)) + if (less_eq(pl->rcv_nxt, l->synch_point)) return false; /* Is it still in the input queue ? */ - post_synch = mod(pl->next_in_no - l->synch_point) - 1; + post_synch = mod(pl->rcv_nxt - l->synch_point) - 1; if (skb_queue_len(&pl->inputq) > post_synch) return false; synched: @@ -1004,7 +1005,7 @@ static void link_retrieve_defq(struct tipc_link *link, return; seq_no = buf_seqno(skb_peek(&link->deferdq)); - if (seq_no == link->next_in_no) + if (seq_no == link->rcv_nxt) skb_queue_splice_tail_init(&link->deferdq, list); } @@ -1119,7 +1120,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) } /* Link is now in state WORKING_WORKING */ - if (unlikely(seq_no != l_ptr->next_in_no)) { + if (unlikely(seq_no != l_ptr->rcv_nxt)) { link_handle_out_of_seq_msg(l_ptr, skb); link_retrieve_defq(l_ptr, &head); skb = NULL; @@ -1130,7 +1131,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) if (!link_synch(l_ptr)) goto unlock; } - l_ptr->next_in_no++; + l_ptr->rcv_nxt++; if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) link_retrieve_defq(l_ptr, &head); if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) { @@ -1301,7 +1302,7 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, * Discard packet if a duplicate; otherwise add it to deferred queue * and notify peer of gap as per protocol specification */ - if (less(seq_no, l_ptr->next_in_no)) { + if (less(seq_no, l_ptr->rcv_nxt)) { l_ptr->stats.duplicates++; kfree_skb(buf); return; @@ -1343,7 +1344,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net)); if (msg_typ == STATE_MSG) { - u16 next_sent = l_ptr->next_out_no; + u16 next_sent = l_ptr->snd_nxt; if (!tipc_link_is_up(l_ptr)) return; @@ -1352,7 +1353,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_next_sent(msg, next_sent); if (!skb_queue_empty(&l_ptr->deferdq)) { last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq)); - gap = mod(last_rcv - l_ptr->next_in_no); + gap = mod(last_rcv - l_ptr->rcv_nxt); } msg_set_seq_gap(msg, gap); if (gap) @@ -1360,7 +1361,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_link_tolerance(msg, tolerance); msg_set_linkprio(msg, priority); msg_set_max_pkt(msg, l_ptr->mtu); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1)); msg_set_probe(msg, probe_msg != 0); if (probe_msg) l_ptr->stats.sent_probes++; @@ -1380,7 +1381,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_linkprio(msg, l_ptr->priority); msg_set_size(msg, msg_size); - msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2))); + msg_set_seqno(msg, mod(l_ptr->snd_nxt + (0xffff / 2))); buf = tipc_buf_acquire(msg_size); if (!buf) @@ -1486,8 +1487,8 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, if (link_reset_unknown(l_ptr)) break; - if (less_eq(l_ptr->next_in_no, msg_next_sent(msg))) - rec_gap = mod(msg_next_sent(msg) - l_ptr->next_in_no); + if (less_eq(l_ptr->rcv_nxt, msg_next_sent(msg))) + rec_gap = mod(msg_next_sent(msg) - l_ptr->rcv_nxt); if (msg_probe(msg)) l_ptr->stats.recv_probes++; @@ -1634,7 +1635,7 @@ tunnel_queue: struct tipc_msg *msg = buf_msg(skb); u32 len = msg_size(msg); - msg_set_ack(msg, mod(link->next_in_no - 1)); + msg_set_ack(msg, mod(link->rcv_nxt - 1)); msg_set_bcast_ack(msg, link->owner->bclink.last_in); msg_set_size(&tnl_hdr, len + INT_H_SIZE); outskb = tipc_buf_acquire(len + INT_H_SIZE); @@ -1723,8 +1724,8 @@ static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) return; l_ptr->tolerance = tol; - l_ptr->cont_intv = msecs_to_jiffies(intv); - l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv)); + l_ptr->keepalive_intv = msecs_to_jiffies(intv); + l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->keepalive_intv)); } void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) @@ -1784,8 +1785,8 @@ static struct tipc_node *tipc_link_find_owner(struct net *net, static void link_reset_statistics(struct tipc_link *l_ptr) { memset(&l_ptr->stats, 0, sizeof(l_ptr->stats)); - l_ptr->stats.sent_info = l_ptr->next_out_no; - l_ptr->stats.recv_info = l_ptr->next_in_no; + l_ptr->stats.sent_info = l_ptr->snd_nxt; + l_ptr->stats.recv_info = l_ptr->rcv_nxt; } static void link_print(struct tipc_link *l_ptr, const char *str) @@ -2018,9 +2019,9 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->next_in_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->next_out_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt)) goto attr_msg_full; if (tipc_link_is_up(link)) diff --git a/net/tipc/link.h b/net/tipc/link.h index a65770bf647c..74d2d12fc120 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -112,25 +112,25 @@ struct tipc_stats { * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] - * @cont_intv: link continuity testing interval + * @keepalive_intv: link keepalive timer interval * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM - * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state + * @silent_intv_cnt: # of timer intervals without any reception from peer * @proto_msg: template for control messages generated by link * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover - * @reset_checkpoint: seq # of last acknowledged message at time of link reset + * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset * @mtu: current maximum packet size for this link * @advertised_mtu: advertised own mtu when link is being established * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent - * @next_out_no: next sequence number to use for outbound messages + * @snt_nxt: next sequence number to use for outbound messages * @last_retransmitted: sequence number of most recently retransmitted message * @stale_count: # of identical retransmit requests made by peer - * @next_in_no: next sequence number to expect for inbound messages + * @rcv_nxt: next sequence number to expect for inbound messages * @deferred_queue: deferred queue saved OOS b'cast message received from node * @unacked_window: # of inbound messages rx'd without ack'ing back to peer * @inputq: buffer queue for messages to be delivered upwards @@ -156,10 +156,10 @@ struct tipc_link { u32 peer_bearer_id; u32 bearer_id; u32 tolerance; - unsigned long cont_intv; + unsigned long keepalive_intv; u32 abort_limit; int state; - u32 fsm_msg_cnt; + u32 silent_intv_cnt; struct { unchar hdr[INT_H_SIZE]; unchar body[TIPC_MAX_IF_NAME]; @@ -185,13 +185,13 @@ struct tipc_link { u16 len; u16 limit; } backlog[5]; - u16 next_out_no; - u16 last_retransmitted; + u16 snd_nxt; + u16 last_retransm; u32 window; u32 stale_count; /* Reception */ - u16 next_in_no; + u16 rcv_nxt; u32 rcv_unacked; struct sk_buff_head deferdq; struct sk_buff_head inputq; -- cgit From cd4eee3c2e3e01590df5cada0d56b396dd726d05 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:16 -0400 Subject: tipc: simplify link supervision checkpointing We change the sequence number checkpointing that is performed by the timer in order to discover if the peer is active. Currently, we store a checkpoint of the next expected sequence number "rcv_nxt" at each timer expiration, and compare it to the current expected number at next timeout expiration. Instead, we now use the already existing field "silent_intv_cnt" for this task. We step the counter at each timeout expiration, and zero it at each valid received packet. If no valid packet has been received from the peer after "abort_limit" number of silent timer intervals, the link is declared faulty and reset. We also remove the multiple instances of timer activation from inside the FSM function "link_state_event()", and now do it at only one place; at the end of the timer function itself. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 81 ++++++++++++++++----------------------------------------- net/tipc/link.h | 2 -- 2 files changed, 22 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index c39543e8112d..a5ea19e9690f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -86,7 +86,7 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { */ #define STARTING_EVT 856384768 /* link processing trigger */ #define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */ -#define TIMEOUT_EVT 560817u /* link timer expired */ +#define SILENCE_EVT 560817u /* timer dicovered silence from peer */ /* * State value stored in 'failover_pkts' @@ -106,6 +106,7 @@ static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); +static void link_set_timer(struct tipc_link *link, unsigned long time); /* * Simple link routines */ @@ -197,11 +198,12 @@ static void link_timeout(unsigned long data) } /* do all other link processing performed on a periodic basis */ - link_state_event(l_ptr, TIMEOUT_EVT); - + if (l_ptr->silent_intv_cnt || tipc_bclink_acks_missing(l_ptr->owner)) + link_state_event(l_ptr, SILENCE_EVT); + l_ptr->silent_intv_cnt++; if (skb_queue_len(&l_ptr->backlogq)) tipc_link_push_packets(l_ptr); - + link_set_timer(l_ptr, l_ptr->keepalive_intv); tipc_node_unlock(l_ptr->owner); tipc_link_put(l_ptr); } @@ -261,7 +263,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, /* note: peer i/f name is updated by reset/activate message */ memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); l_ptr->owner = n_ptr; - l_ptr->checkpoint = 1; l_ptr->peer_session = INVALID_SESSION; l_ptr->bearer_id = b_ptr->identity; link_set_supervision_props(l_ptr, b_ptr->tolerance); @@ -468,7 +469,6 @@ void tipc_link_reset(struct tipc_link *l_ptr) tipc_link_purge_backlog(l_ptr); l_ptr->reasm_buf = NULL; l_ptr->rcv_unacked = 0; - l_ptr->checkpoint = 1; l_ptr->snd_nxt = 1; l_ptr->silent_intv_cnt = 0; l_ptr->stale_count = 0; @@ -481,6 +481,7 @@ static void link_activate(struct tipc_link *link) link->rcv_nxt = 1; link->stats.recv_info = 1; + link->silent_intv_cnt = 0; tipc_node_link_up(node, link); tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); } @@ -501,45 +502,33 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT)) return; /* Not yet. */ - if (l_ptr->flags & LINK_FAILINGOVER) { - if (event == TIMEOUT_EVT) - link_set_timer(l_ptr, timer_intv); + if (l_ptr->flags & LINK_FAILINGOVER) return; - } switch (l_ptr->state) { case WORKING_WORKING: switch (event) { case TRAFFIC_MSG_EVT: case ACTIVATE_MSG: + l_ptr->silent_intv_cnt = 0; break; - case TIMEOUT_EVT: - if (l_ptr->rcv_nxt != l_ptr->checkpoint) { - l_ptr->checkpoint = l_ptr->rcv_nxt; - if (tipc_bclink_acks_missing(l_ptr->owner)) { + case SILENCE_EVT: + if (!l_ptr->silent_intv_cnt) { + if (tipc_bclink_acks_missing(l_ptr->owner)) tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - l_ptr->silent_intv_cnt++; - } - link_set_timer(l_ptr, timer_intv); break; } l_ptr->state = WORKING_UNKNOWN; - l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: pr_debug("%s<%s>, requested by peer\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; - l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); break; default: pr_debug("%s%u in WW state\n", link_unk_evt, event); @@ -551,46 +540,32 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) case ACTIVATE_MSG: l_ptr->state = WORKING_WORKING; l_ptr->silent_intv_cnt = 0; - link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: pr_debug("%s<%s>, requested by peer while probing\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; - l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); break; - case TIMEOUT_EVT: - if (l_ptr->rcv_nxt != l_ptr->checkpoint) { + case SILENCE_EVT: + if (!l_ptr->silent_intv_cnt) { l_ptr->state = WORKING_WORKING; - l_ptr->silent_intv_cnt = 0; - l_ptr->checkpoint = l_ptr->rcv_nxt; - if (tipc_bclink_acks_missing(l_ptr->owner)) { + if (tipc_bclink_acks_missing(l_ptr->owner)) tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - l_ptr->silent_intv_cnt++; - } - link_set_timer(l_ptr, timer_intv); } else if (l_ptr->silent_intv_cnt < l_ptr->abort_limit) { tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); } else { /* Link has failed */ pr_debug("%s<%s>, peer not responding\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_UNKNOWN; - l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); } break; default: @@ -606,31 +581,22 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; - l_ptr->silent_intv_cnt = 0; link_activate(l_ptr); tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->silent_intv_cnt++; if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: l_ptr->state = RESET_RESET; - l_ptr->silent_intv_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); break; case STARTING_EVT: l_ptr->flags |= LINK_STARTED; - l_ptr->silent_intv_cnt++; link_set_timer(l_ptr, timer_intv); break; - case TIMEOUT_EVT: + case SILENCE_EVT: tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); break; default: pr_err("%s%u in RU state\n", link_unk_evt, event); @@ -644,21 +610,16 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; - l_ptr->silent_intv_cnt = 0; link_activate(l_ptr); tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->silent_intv_cnt++; if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, timer_intv); break; case RESET_MSG: break; - case TIMEOUT_EVT: + case SILENCE_EVT: tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0); - l_ptr->silent_intv_cnt++; - link_set_timer(l_ptr, timer_intv); break; default: pr_err("%s%u in RR state\n", link_unk_evt, event); @@ -1126,6 +1087,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) skb = NULL; goto unlock; } + l_ptr->silent_intv_cnt = 0; + /* Synchronize with parallel link if applicable */ if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) { if (!link_synch(l_ptr)) @@ -1295,8 +1258,8 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, return; } - /* Record OOS packet arrival (force mismatch on next timeout) */ - l_ptr->checkpoint--; + /* Record OOS packet arrival */ + l_ptr->silent_intv_cnt = 0; /* * Discard packet if a duplicate; otherwise add it to deferred queue @@ -1480,7 +1443,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, } /* Record reception; force mismatch at next timeout: */ - l_ptr->checkpoint--; + l_ptr->silent_intv_cnt = 0; link_state_event(l_ptr, TRAFFIC_MSG_EVT); l_ptr->stats.recv_states++; diff --git a/net/tipc/link.h b/net/tipc/link.h index 74d2d12fc120..0c02c973e985 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -107,7 +107,6 @@ struct tipc_stats { * @owner: pointer to peer node * @refcnt: reference counter for permanent references (owner node & timer) * @flags: execution state flags for link endpoint instance - * @checkpoint: reference point for triggering link continuity checking * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link @@ -151,7 +150,6 @@ struct tipc_link { /* Management and link supervision data */ unsigned int flags; - u16 checkpoint; u32 peer_session; u32 peer_bearer_id; u32 bearer_id; -- cgit From f21e897eccb5a236f4191ecc1b4391eda895d6ed Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:17 -0400 Subject: tipc: improve link congestion algorithm The link congestion algorithm used until now implies two problems. - It is too generous towards lower-level messages in situations of high load by giving "absolute" bandwidth guarantees to the different priority levels. LOW traffic is guaranteed 10%, MEDIUM is guaranted 20%, HIGH is guaranteed 30%, and CRITICAL is guaranteed 40% of the available bandwidth. But, in the absence of higher level traffic, the ratio between two distinct levels becomes unreasonable. E.g. if there is only LOW and MEDIUM traffic on a system, the former is guaranteed 1/3 of the bandwidth, and the latter 2/3. This again means that if there is e.g. one LOW user and 10 MEDIUM users, the former will have 33.3% of the bandwidth, and the others will have to compete for the remainder, i.e. each will end up with 6.7% of the capacity. - Packets of type MSG_BUNDLER are created at SYSTEM importance level, but only after the packets bundled into it have passed the congestion test for their own respective levels. Since bundled packets don't result in incrementing the level counter for their own importance, only occasionally for the SYSTEM level counter, they do in practice obtain SYSTEM level importance. Hence, the current implementation provides a gap in the congestion algorithm that in the worst case may lead to a link reset. We now refine the congestion algorithm as follows: - A message is accepted to the link backlog only if its own level counter, and all superior level counters, permit it. - The importance of a created bundle packet is set according to its contents. A bundle packet created from messges at levels LOW to CRITICAL is given importance level CRITICAL, while a bundle created from a SYSTEM level message is given importance SYSTEM. In the latter case only subsequent SYSTEM level messages are allowed to be bundled into it. This solves the first problem described above, by making the bandwidth guarantee relative to the total number of users at all levels; only the upper limit for each level remains absolute. In the example described above, the single LOW user would use 1/11th of the bandwidth, the same as each of the ten MEDIUM users, but he still has the same guarantee against starvation as the latter ones. The fix also solves the second problem. If the CRITICAL level is filled up by bundle packets of that level, no lower level packets will be accepted any more. Suggested-by: Gergely Kiss Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 11 ++++++----- net/tipc/msg.c | 7 +++++++ net/tipc/msg.h | 14 +++++++++----- 3 files changed, 22 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index a5ea19e9690f..c1aba697776f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -645,7 +645,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, { struct tipc_msg *msg = buf_msg(skb_peek(list)); unsigned int maxwin = link->window; - unsigned int imp = msg_importance(msg); + unsigned int i, imp = msg_importance(msg); uint mtu = link->mtu; u16 ack = mod(link->rcv_nxt - 1); u16 seqno = link->snd_nxt; @@ -655,10 +655,11 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *backlogq = &link->backlogq; struct sk_buff *skb, *tmp; - /* Match backlog limit against msg importance: */ - if (unlikely(link->backlog[imp].len >= link->backlog[imp].limit)) - return link_schedule_user(link, list); - + /* Match msg importance against this and all higher backlog limits: */ + for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { + if (unlikely(link->backlog[i].len >= link->backlog[i].limit)) + return link_schedule_user(link, list); + } if (unlikely(msg_size(msg) > mtu)) { __skb_queue_purge(list); return -EMSGSIZE; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index c3e96e815418..ff7362d40cb3 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -365,6 +365,9 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) return false; if (unlikely(max < (start + msz))) return false; + if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && + (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + return false; skb_put(bskb, pad + msz); skb_copy_to_linear_data_offset(bskb, start, skb->data, msz); @@ -448,6 +451,10 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) bmsg = buf_msg(bskb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); + if (msg_isdata(msg)) + msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE); + else + msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE); msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 6ca2366f3a53..6caf16c475e0 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -352,18 +352,22 @@ static inline void msg_set_seqno(struct tipc_msg *m, u16 n) */ static inline u32 msg_importance(struct tipc_msg *m) { - if (unlikely(msg_user(m) == MSG_FRAGMENTER)) + int usr = msg_user(m); + + if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) + return usr; + if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 5, 13, 0x7); - if (likely(msg_isdata(m) && !msg_errcode(m))) - return msg_user(m); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { - if (unlikely(msg_user(m) == MSG_FRAGMENTER)) + int usr = msg_user(m); + + if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) msg_set_bits(m, 5, 13, 0x7, i); - else if (likely(i < TIPC_SYSTEM_IMPORTANCE)) + else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); -- cgit From dd3f9e70f59f43a5712eba9cf3ee4f1e6999540c Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 14 May 2015 10:46:18 -0400 Subject: tipc: add packet sequence number at instant of transmission Currently, the packet sequence number is updated and added to each packet at the moment a packet is added to the link backlog queue. This is wasteful, since it forces the code to traverse the send packet list packet by packet when adding them to the backlog queue. It would be better to just splice the whole packet list into the backlog queue when that is the right action to do. In this commit, we do this change. Also, since the sequence numbers cannot now be assigned to the packets at the moment they are added the backlog queue, we do instead calculate and add them at the moment of transmission, when the backlog queue has to be traversed anyway. We do this in the function tipc_link_push_packet(). Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 6 +----- net/tipc/link.c | 37 ++++++++++++++++++++++++++++--------- net/tipc/msg.c | 44 +++++++++++++++++++++----------------------- net/tipc/msg.h | 6 +++--- net/tipc/node.c | 2 +- 5 files changed, 54 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 842e19f6abf6..4906ca3c0f3a 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -115,12 +115,8 @@ static void bclink_set_last_sent(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *bcl = tn->bcl; - struct sk_buff *skb = skb_peek(&bcl->backlogq); - if (skb) - bcl->silent_intv_cnt = mod(buf_seqno(skb) - 1); - else - bcl->silent_intv_cnt = mod(bcl->snd_nxt - 1); + bcl->silent_intv_cnt = mod(bcl->snd_nxt - 1); } u32 tipc_bclink_get_last_sent(struct net *net) diff --git a/net/tipc/link.c b/net/tipc/link.c index c1aba697776f..fb2a003c8e6d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -653,7 +653,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct tipc_media_addr *addr = &link->media_addr; struct sk_buff_head *transmq = &link->transmq; struct sk_buff_head *backlogq = &link->backlogq; - struct sk_buff *skb, *tmp; + struct sk_buff *skb, *bskb; /* Match msg importance against this and all higher backlog limits: */ for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { @@ -665,32 +665,36 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, return -EMSGSIZE; } /* Prepare each packet for sending, and add to relevant queue: */ - skb_queue_walk_safe(list, skb, tmp) { - __skb_unlink(skb, list); + while (skb_queue_len(list)) { + skb = skb_peek(list); msg = buf_msg(skb); msg_set_seqno(msg, seqno); msg_set_ack(msg, ack); msg_set_bcast_ack(msg, bc_last_in); if (likely(skb_queue_len(transmq) < maxwin)) { + __skb_dequeue(list); __skb_queue_tail(transmq, skb); tipc_bearer_send(net, link->bearer_id, skb, addr); link->rcv_unacked = 0; seqno++; continue; } - if (tipc_msg_bundle(skb_peek_tail(backlogq), skb, mtu)) { + if (tipc_msg_bundle(skb_peek_tail(backlogq), msg, mtu)) { + kfree_skb(__skb_dequeue(list)); link->stats.sent_bundled++; continue; } - if (tipc_msg_make_bundle(&skb, mtu, link->addr)) { + if (tipc_msg_make_bundle(&bskb, msg, mtu, link->addr)) { + kfree_skb(__skb_dequeue(list)); + __skb_queue_tail(backlogq, bskb); + link->backlog[msg_importance(buf_msg(bskb))].len++; link->stats.sent_bundled++; link->stats.sent_bundles++; - imp = msg_importance(buf_msg(skb)); + continue; } - __skb_queue_tail(backlogq, skb); - link->backlog[imp].len++; - seqno++; + link->backlog[imp].len += skb_queue_len(list); + skb_queue_splice_tail_init(list, backlogq); } link->snd_nxt = seqno; return 0; @@ -822,6 +826,7 @@ void tipc_link_push_packets(struct tipc_link *link) { struct sk_buff *skb; struct tipc_msg *msg; + u16 seqno = link->snd_nxt; u16 ack = mod(link->rcv_nxt - 1); while (skb_queue_len(&link->transmq) < link->window) { @@ -831,12 +836,15 @@ void tipc_link_push_packets(struct tipc_link *link) msg = buf_msg(skb); link->backlog[msg_importance(msg)].len--; msg_set_ack(msg, ack); + msg_set_seqno(msg, seqno); + seqno = mod(seqno + 1); msg_set_bcast_ack(msg, link->owner->bclink.last_in); link->rcv_unacked = 0; __skb_queue_tail(&link->transmq, skb); tipc_bearer_send(link->owner->net, link->bearer_id, skb, &link->media_addr); } + link->snd_nxt = seqno; } void tipc_link_reset_all(struct tipc_node *node) @@ -1526,6 +1534,11 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL, FAILOVER_MSG, INT_H_SIZE, l_ptr->addr); + + skb_queue_walk(&l_ptr->backlogq, skb) { + msg_set_seqno(buf_msg(skb), l_ptr->snd_nxt); + l_ptr->snd_nxt = mod(l_ptr->snd_nxt + 1); + } skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); tipc_link_purge_backlog(l_ptr); msgcount = skb_queue_len(&l_ptr->transmq); @@ -1586,6 +1599,7 @@ void tipc_link_dup_queue_xmit(struct tipc_link *link, struct tipc_msg tnl_hdr; struct sk_buff_head *queue = &link->transmq; int mcnt; + u16 seqno; tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL, SYNCH_MSG, INT_H_SIZE, link->addr); @@ -1617,6 +1631,11 @@ tunnel_queue: } if (queue == &link->backlogq) return; + seqno = link->snd_nxt; + skb_queue_walk(&link->backlogq, skb) { + msg_set_seqno(buf_msg(skb), seqno); + seqno = mod(seqno + 1); + } queue = &link->backlogq; goto tunnel_queue; } diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ff7362d40cb3..08b4cc7d496d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -331,16 +331,15 @@ error: /** * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @bskb: the buffer to append to ("bundle") - * @skb: buffer to be appended + * @skb: the buffer to append to ("bundle") + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer * Consumes buffer if successful * Returns true if bundling could be performed, otherwise false */ -bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) { struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(skb); unsigned int bsz; unsigned int msz = msg_size(msg); u32 start, pad; @@ -348,9 +347,9 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) if (likely(msg_user(msg) == MSG_FRAGMENTER)) return false; - if (!bskb) + if (!skb) return false; - bmsg = buf_msg(bskb); + bmsg = buf_msg(skb); bsz = msg_size(bmsg); start = align(bsz); pad = start - bsz; @@ -359,9 +358,9 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) return false; if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) return false; - if (likely(msg_user(bmsg) != MSG_BUNDLER)) + if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) return false; - if (unlikely(skb_tailroom(bskb) < (pad + msz))) + if (unlikely(skb_tailroom(skb) < (pad + msz))) return false; if (unlikely(max < (start + msz))) return false; @@ -369,11 +368,10 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) return false; - skb_put(bskb, pad + msz); - skb_copy_to_linear_data_offset(bskb, start, skb->data, msz); + skb_put(skb, pad + msz); + skb_copy_to_linear_data_offset(skb, start, msg, msz); msg_set_size(bmsg, start + msz); msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); - kfree_skb(skb); return true; } @@ -419,18 +417,18 @@ none: /** * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain - * @skb: buffer to be appended and replaced + * @list: the buffer chain, where head is the buffer to replace/append + * @skb: buffer to be created, appended to and returned in case of success + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer, inclusive header * @dnode: destination node for message. (Not always present in header) - * Replaces buffer if successful * Returns true if success, otherwise false */ -bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode) { - struct sk_buff *bskb; + struct sk_buff *_skb; struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(*skb); u32 msz = msg_size(msg); u32 max = mtu - INT_H_SIZE; @@ -443,12 +441,12 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) if (msz > (max / 2)) return false; - bskb = tipc_buf_acquire(max); - if (!bskb) + _skb = tipc_buf_acquire(max); + if (!_skb) return false; - skb_trim(bskb, INT_H_SIZE); - bmsg = buf_msg(bskb); + skb_trim(_skb, INT_H_SIZE); + bmsg = buf_msg(_skb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); if (msg_isdata(msg)) @@ -458,8 +456,8 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - tipc_msg_bundle(bskb, *skb, mtu); - *skb = bskb; + tipc_msg_bundle(_skb, msg, mtu); + *skb = _skb; return true; } diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 6caf16c475e0..19c45fb66238 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -776,9 +776,9 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu); - -bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode); +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); diff --git a/net/tipc/node.c b/net/tipc/node.c index eb3856bb8c5a..0b1d61a5f853 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012-2014, Ericsson AB + * Copyright (c) 2000-2006, 2012-2015, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * -- cgit From dd3aa3b5fbf61adb0ef5d44a6753646b767b9c77 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Thu, 14 May 2015 13:20:15 -0400 Subject: cls_flower: Fix compile error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix compile error in net/sched/cls_flower.c net/sched/cls_flower.c: In function ‘fl_set_key’: net/sched/cls_flower.c:240:3: error: implicit declaration of function ‘tcf_change_indev’ [-Werror=implicit-function-declaration] err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); Introduced in 77b9900ef53ae Fixes: 77b9900ef53ae ("tc: introduce Flower classifier") Signed-off-by: Brian Haley Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9bc654c764cd..8c8f34ef6980 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -234,15 +234,15 @@ static void fl_set_key_val(struct nlattr **tb, static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { - int err; - +#ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { - err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); if (err < 0) return err; key->indev_ifindex = err; mask->indev_ifindex = 0xffffffff; } +#endif fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, -- cgit From fa787ae0624c46d0c894c19ae74f53d7bb4406f8 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 13 May 2015 11:20:38 +0800 Subject: tipc: use sock_create_kern interface to create kernel socket After commit eeb1bd5c40ed ("net: Add a struct net parameter to sock_create_kern"), we should use sock_create_kern() to create kernel socket as the interface doesn't reference count struct net any more. Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index a91a2f79209a..922e04a43396 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -325,7 +325,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) struct socket *sock = NULL; int ret; - ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1); + ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock); if (ret < 0) return NULL; ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, -- cgit From 54d7c01d3ed699cfc213115eaecfe1175cfaff8f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 14 May 2015 15:25:02 -0400 Subject: packet: fix warnings in rollover lock contention Avoid two xchg calls whose return values were unused, causing a warning on some architectures. The relevant variable is a hint and read without mutual exclusion. This fix makes all writers hold the receive_queue lock. Suggested-by: David S. Miller Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 31d58565726c..c30d14781576 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1301,17 +1301,12 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) int ret; bool has_room; - if (po->prot_hook.func == tpacket_rcv) { - spin_lock(&po->sk.sk_receive_queue.lock); - ret = __packet_rcv_has_room(po, skb); - spin_unlock(&po->sk.sk_receive_queue.lock); - } else { - ret = __packet_rcv_has_room(po, skb); - } - + spin_lock_bh(&po->sk.sk_receive_queue.lock); + ret = __packet_rcv_has_room(po, skb); has_room = ret == ROOM_NORMAL; if (po->pressure == has_room) - xchg(&po->pressure, !has_room); + po->pressure = !has_room; + spin_unlock_bh(&po->sk.sk_receive_queue.lock); return ret; } @@ -3814,7 +3809,7 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, mask |= POLLIN | POLLRDNORM; } if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - xchg(&po->pressure, 0); + po->pressure = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { -- cgit From c24a59649f3c8f4f78adc2d0e31423fa883b012b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 14 May 2015 14:31:28 -0700 Subject: ip_tunnel: Report Rx dropped in ip_tunnel_get_stats64 The rx_dropped stat wasn't being reported when ip_tunnel_get_stats64 was called. This was leading to some confusing results in my debug as I was seeing rx_errors increment but no other value which pointed me toward the type of error being seen. This change corrects that by using netdev_stats_to_stats64 to copy all available dev stats instead of just the few that were hand picked. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 3998b1822d85..6a51a71a6c67 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -165,6 +165,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, { int i; + netdev_stats_to_stats64(tot, &dev->stats); + for_each_possible_cpu(i) { const struct pcpu_sw_netstats *tstats = per_cpu_ptr(dev->tstats, i); @@ -185,22 +187,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, tot->tx_bytes += tx_bytes; } - tot->multicast = dev->stats.multicast; - - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - tot->collisions = dev->stats.collisions; - return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); -- cgit From 0c58a2db91747c841d042b1d56615fb1eaf138c7 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 May 2015 13:43:09 +0200 Subject: netns: fix unbalanced spin_lock on error Unlock was missing on error path. Fixes: 95f38411df05 ("netns: use a spin_lock to protect nsid management") Reported-by: Dan Carpenter Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cbee75f2fc28..d2f42da9479b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -565,6 +565,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) spin_lock_irqsave(&nsid_lock, flags); if (__peernet2id(net, peer) >= 0) { + spin_unlock_irqrestore(&nsid_lock, flags); err = -EEXIST; goto out; } -- cgit From 3365495c1883561e4a8811f46a3800c512a3c00a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 14 May 2015 00:36:28 +0200 Subject: net: core: set qdisc pkt len before tc_classify commit d2788d34885d4ce5ba ("net: sched: further simplify handle_ing") removed the call to qdisc_enqueue_root(). However, after this removal we no longer set qdisc pkt length. This breaks traffic policing on ingress. This is the minimum fix: set qdisc pkt length before tc_classify. Only setting the length does remove support for 'stab' on ingress, but as Alexei pointed out: "Though it was allowed to add qdisc_size_table to ingress, it's useless. Nothing takes advantage of recomputed qdisc_pkt_len". Jamal suggested to use qdisc_pkt_len_init(), but as Eric mentioned that would result in qdisc_pkt_len_init to no longer get inlined due to the additional 2nd call site. ingress policing is rare and GRO doesn't really work that well with police on ingress, as we see packets > mtu and drop skbs that -- without aggregation -- would still have fitted the policier budget. Thus to have reliable/smooth ingress policing GRO has to be turned off. Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Jamal Hadi Salim Fixes: d2788d34885d ("net: sched: further simplify handle_ing") Acked-by: Daniel Borkmann Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 29f0d6e6542c..0e7afefb5072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3647,8 +3647,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, *pt_prev = NULL; } - qdisc_bstats_update_cpu(cl->q, skb); + qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + qdisc_bstats_update_cpu(cl->q, skb); switch (tc_classify(skb, cl, &cl_res)) { case TC_ACT_OK: -- cgit From cffd2eedf91aa9e459b8640807ee6ea7bd8ee145 Mon Sep 17 00:00:00 2001 From: Frederic Danis Date: Fri, 15 May 2015 11:58:39 +0200 Subject: Bluetooth: Fix calls to __hci_cmd_sync() Remove test of command reply status as it is already performed by __hci_cmd_sync(). __hci_cmd_sync_ev() function already returns an error if it got a non-zero status either through a Command Complete or a Command Status event. For both of these events the status is collected up in the event handlers called by hci_event_packet() and then passed as the second parameter to req_complete_skb(). The req_complete_skb() callback in turn is hci_req_sync_complete() for __hci_cmd_sync_ev() which stores the status in hdev->req_result. The hdev->req_result is then further converted through bt_to_errno() back in __hci_cmd_sync_ev(). Signed-off-by: Frederic Danis Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4663c3dad3f5..db11b9d728b3 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -94,7 +94,6 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size = min(count, (sizeof(buf)-1)); bool enable; - int err; if (!test_bit(HCI_UP, &hdev->flags)) return -ENETDOWN; @@ -121,12 +120,8 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, if (IS_ERR(skb)) return PTR_ERR(skb); - err = -bt_to_errno(skb->data[0]); kfree_skb(skb); - if (err < 0) - return err; - hci_dev_change_flag(hdev, HCI_DUT_MODE); return count; -- cgit From 55917a21d0cc012bb6073bb05bb768fd51d8e237 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 May 2015 14:57:23 +0200 Subject: netfilter: x_tables: add context to know if extension runs from nft_compat Currently, we have four xtables extensions that cannot be used from the xt over nft compat layer. The problem is that they need real access to the full blown xt_entry to validate that the rule comes with the right dependencies. This check was introduced to overcome the lack of sufficient userspace dependency validation in iptables. To resolve this problem, this patch introduces a new field to the xt_tgchk_param structure that tell us if the extension is run from nft_compat context. The three affected extensions are: 1) CLUSTERIP, this target has been superseded by xt_cluster. So just bail out by returning -EINVAL. 2) TCPMSS. Relax the checking when used from nft_compat. If used with the wrong configuration, it will corrupt !syn packets by adding TCP MSS option. 3) ebt_stp. Relax the check to make sure it uses the reserved destination MAC address for STP. Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez --- net/bridge/netfilter/ebt_stp.c | 6 ++++-- net/ipv4/netfilter/ipt_CLUSTERIP.c | 5 +++++ net/netfilter/nft_compat.c | 2 ++ net/netfilter/xt_TCPMSS.c | 6 ++++++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 071d87214dde..0c40570069ba 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -164,8 +164,10 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par) !(info->bitmask & EBT_STP_MASK)) return -EINVAL; /* Make sure the match only receives stp frames */ - if (!ether_addr_equal(e->destmac, bridge_ula) || - !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC)) + if (!par->nft_compat && + (!ether_addr_equal(e->destmac, bridge_ula) || + !ether_addr_equal(e->destmsk, msk) || + !(e->bitmask & EBT_DESTMAC))) return -EINVAL; return 0; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 771ab3d01ad3..45cb16a6a4a3 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) struct clusterip_config *config; int ret; + if (par->nft_compat) { + pr_err("cannot use CLUSTERIP target from nftables compat\n"); + return -EOPNOTSUPP; + } + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 7f29cfc76349..66def315eb56 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -161,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void target_compat_from_user(struct xt_target *t, void *in, void *out) @@ -377,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void match_compat_from_user(struct xt_match *m, void *in, void *out) diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index e762de5ee89b..8c3190e2fc6a 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; @@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; -- cgit From 861fb1078fd4ea09b442987b3e20fced0f15eb92 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 12 May 2015 18:28:23 -0700 Subject: netfilter: Use correct return for seq_show functions Using seq_has_overflowed doesn't produce the right return value. Either 0 or -1 is, but 0 is much more common and works well when seq allocation retries. I believe this doesn't matter as the initial allocation is always sufficient, this is just a correctness patch. Miscellanea: o Don't use strlen, use *ptr to determine if a string should be emitted like all the other tests here o Delete unnecessary return statements Signed-off-by: Joe Perches Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue_core.c | 2 +- net/netfilter/x_tables.c | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 0b98c7420239..bec7c60fe4d0 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -1257,7 +1257,7 @@ static int seq_show(struct seq_file *s, void *v) inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); - return seq_has_overflowed(s); + return 0; } static const struct seq_operations nfqnl_seq_ops = { diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 51a459c3c649..83032464a4bd 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -947,11 +947,9 @@ static int xt_table_seq_show(struct seq_file *seq, void *v) { struct xt_table *table = list_entry(v, struct xt_table, list); - if (strlen(table->name)) { + if (*table->name) seq_printf(seq, "%s\n", table->name); - return seq_has_overflowed(seq); - } else - return 0; + return 0; } static const struct seq_operations xt_table_seq_ops = { @@ -1087,10 +1085,8 @@ static int xt_match_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; match = list_entry(trav->curr, struct xt_match, list); - if (*match->name == '\0') - return 0; - seq_printf(seq, "%s\n", match->name); - return seq_has_overflowed(seq); + if (*match->name) + seq_printf(seq, "%s\n", match->name); } return 0; } @@ -1142,10 +1138,8 @@ static int xt_target_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; target = list_entry(trav->curr, struct xt_target, list); - if (*target->name == '\0') - return 0; - seq_printf(seq, "%s\n", target->name); - return seq_has_overflowed(seq); + if (*target->name) + seq_printf(seq, "%s\n", target->name); } return 0; } -- cgit From 252a8fbe819d041b29789e2035cd1760f373345f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 08:58:45 -0700 Subject: ipip: fix one sparse error make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/ipip.o CHECK net/ipv4/ipip.c net/ipv4/ipip.c:254:27: warning: incorrect type in assignment (different base types) net/ipv4/ipip.c:254:27: expected restricted __be32 [addressable] [usertype] o_key net/ipv4/ipip.c:254:27: got restricted __be16 [addressable] [usertype] i_flags Fixes: 3b7b514f44bf ("ipip: fix a regression in ioctl") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ff96396ebec5..254238daf58b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -251,7 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } - p.i_key = p.o_key = p.i_flags = p.o_flags = 0; + p.i_key = p.o_key = 0; + p.i_flags = p.o_flags = 0; if (p.iph.ttl) p.iph.frag_off |= htons(IP_DF); -- cgit From ba6d05641c8a22272a181ffbdacde139d4b986b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 09:07:31 -0700 Subject: netfilter: synproxy: fix sparse errors Fix verbose sparse errors : make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/netfilter/ipt_SYNPROXY.o Signed-off-by: Eric Dumazet Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_SYNPROXY.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index e9e67793055f..fe8cc183411e 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,7 +18,7 @@ #include static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; @@ -220,7 +220,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; -- cgit From 4633c9e07b3b7d7fc262a5f59ff635c1f702af6f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 17 May 2015 19:44:02 -0400 Subject: net-packet: fix null pointer exception in rollover mode Rollover can be enabled as flag or mode. Allocate state in both cases. This solves a NULL pointer exception in fanout_demux_rollover on referencing po->rollover if using mode rollover. Also make sure that in rollover mode each silo is tried (contrary to rollover flag, where the main socket is excluded after an initial try_self). Tested: Passes tools/testing/net/psock_fanout.c, which tests both modes and flag. My previous tests were limited to bench_rollover, which only stresses the flag. The test now completes safely. it still gives an error for mode rollover, because it does not expect the new headroom (ROOM_NORMAL) requirement. I will send a separate patch to the test. Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state") Signed-off-by: Willem de Bruijn ---- I should have run this test and caught this before submission, of course. Apologies for the oversight. Signed-off-by: David S. Miller --- net/packet/af_packet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c30d14781576..fd5164139bf0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1389,7 +1389,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int idx, bool try_self, unsigned int num) { - struct packet_sock *po, *po_next; + struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(f->arr[idx]); @@ -1399,12 +1399,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; + po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(f->arr[i]); - if (po_next != po && !po_next->pressure && + if (po_next != po_skip && !po_next->pressure && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; @@ -1549,7 +1550,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (po->fanout) return -EALREADY; - if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) { + if (type == PACKET_FANOUT_ROLLOVER || + (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); if (!po->rollover) return -ENOMEM; -- cgit From 1a24e04e4b50939daa3041682b38b82c896ca438 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:25 -0700 Subject: net: fix sk_mem_reclaim_partial() sk_mem_reclaim_partial() goal is to ensure each socket has one SK_MEM_QUANTUM forward allocation. This is needed both for performance and better handling of memory pressure situations in follow up patches. SK_MEM_QUANTUM is currently a page, but might be reduced to 4096 bytes as some arches have 64KB pages. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index c18738a795b0..29124fcdc42a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2069,12 +2069,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_reclaim - reclaim memory_allocated * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ -void __sk_mem_reclaim(struct sock *sk) +void __sk_mem_reclaim(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + amount >>= SK_MEM_QUANTUM_SHIFT; + sk_memory_allocated_sub(sk, amount); + sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) -- cgit From a6c5ea4ccf0033591e6e476d7a273c0074c07aa7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:26 -0700 Subject: tcp: rename sk_forced_wmem_schedule() to sk_forced_mem_schedule() We plan to use sk_forced_wmem_schedule() in input path as well, so make it non static and rename it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7386d32cd670..bac1a950d087 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2816,8 +2816,10 @@ begin_fwd: * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() */ -static void sk_forced_wmem_schedule(struct sock *sk, int size) +void sk_forced_mem_schedule(struct sock *sk, int size) { int amt, status; @@ -2864,7 +2866,7 @@ coalesce: return; } skb_reserve(skb, MAX_TCP_HEADER); - sk_forced_wmem_schedule(sk, skb->truesize); + sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); -- cgit From b8da51ebb1aa93908350f95efae73aecbc2e266c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:27 -0700 Subject: tcp: introduce tcp_under_memory_pressure() Introduce an optimized version of sk_under_memory_pressure() for TCP. Our intent is to use it in fast paths. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cf8b20ff6658..093779f7e893 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !sk_under_memory_pressure(sk)) { + !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !sk_under_memory_pressure(sk) && + !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -4781,7 +4781,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (sk_under_memory_pressure(sk)) + else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4825,7 +4825,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) return false; /* If we are under soft global TCP memory pressure, do not expand. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bac1a950d087..08c2cc40b26d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2392,7 +2392,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2843,7 +2843,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { + if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 65bf670e8714..5b752f58a900 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk) } out: - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) sk_mem_reclaim(sk); } -- cgit From 8e4d980ac21596a9b91d8e720c77ad081975a0a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:28 -0700 Subject: tcp: fix behavior for epoll edge trigger Under memory pressure, tcp_sendmsg() can fail to queue a packet while no packet is present in write queue. If we return -EAGAIN with no packet in write queue, no ACK packet will ever come to raise EPOLLOUT. We need to allow one skb per TCP socket, and make sure that tcp sockets can release their forward allocations under pressure. This is a followup to commit 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ecccfdc50d76..9eabfd3e0925 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -815,9 +815,20 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_schedule; + + if (skb_queue_len(&sk->sk_write_queue) == 0) { + mem_schedule = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_schedule = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_schedule)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes -- cgit From 76dfa6082032b5c179864816fa508879421678eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:29 -0700 Subject: tcp: allow one skb to be received per socket under memory pressure While testing tight tcp_mem settings, I found tcp sessions could be stuck because we do not allow even one skb to be received on them. By allowing one skb to be received, we introduce fairness and eventuallu force memory hogs to release their allocation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 093779f7e893..40c435997e54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4507,10 +4507,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: - if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - + if (eaten < 0) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); -- cgit From b66e91ccbc34ebd5a2f90f9e1bc1597e2924a500 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:30 -0700 Subject: tcp: halves tcp_mem[] limits Allowing tcp to use ~19% of physical memory is way too much, and allowed bugs to be hidden. Add to this that some drivers use a full page per incoming frame, so real cost can be twice the advertized one. Reduce tcp_mem by 50 % as a first step to sanity. tcp_mem[0,1,2] defaults are now 4.68%, 6.25%, 9.37% of physical memory. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9eabfd3e0925..c724195e5862 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3068,11 +3068,12 @@ __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) -- cgit From 45d4122ca7cdb3a4b91f392605cd22cfa75f1d99 Mon Sep 17 00:00:00 2001 From: "Samudrala, Sridhar" Date: Wed, 13 May 2015 21:55:43 -0700 Subject: switchdev: add support for fdb add/del/dump via switchdev_port_obj ops. - introduce port fdb obj and generic switchdev_port_fdb_add/del/dump() - use switchdev_port_fdb_add/del/dump in rocker/team/bonding ndo ops. - add support for fdb obj in switchdev_port_obj_add/del/dump() - switch rocker to implement fdb ops via switchdev_ops v3: updated to sync with named union changes. Signed-off-by: Sridhar Samudrala Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 0409f9b5bdbc..d4c8cf828240 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -296,6 +296,36 @@ int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj) } EXPORT_SYMBOL_GPL(switchdev_port_obj_del); +/** + * switchdev_port_obj_dump - Dump port objects + * + * @dev: port device + * @obj: object to dump + */ +int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_obj_dump) + return ops->switchdev_port_obj_dump(dev, obj); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to dump objects on + * first port at bottom of stack. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_obj_dump(lower_dev, obj); + break; + } + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_dump); + static DEFINE_MUTEX(switchdev_mutex); static RAW_NOTIFIER_HEAD(switchdev_notif_chain); @@ -566,6 +596,151 @@ int switchdev_port_bridge_dellink(struct net_device *dev, } EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); +/** + * switchdev_port_fdb_add - Add FDB (MAC/VLAN) entry to port + * + * @ndmsg: netlink hdr + * @nlattr: netlink attributes + * @dev: port device + * @addr: MAC address to add + * @vid: VLAN to add + * + * Add FDB entry to switch device. + */ +int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 vid, u16 nlm_flags) +{ + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .u.fdb = { + .addr = addr, + .vid = vid, + }, + }; + + return switchdev_port_obj_add(dev, &obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); + +/** + * switchdev_port_fdb_del - Delete FDB (MAC/VLAN) entry from port + * + * @ndmsg: netlink hdr + * @nlattr: netlink attributes + * @dev: port device + * @addr: MAC address to delete + * @vid: VLAN to delete + * + * Delete FDB entry from switch device. + */ +int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 vid) +{ + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .u.fdb = { + .addr = addr, + .vid = vid, + }, + }; + + return switchdev_port_obj_del(dev, &obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); + +struct switchdev_fdb_dump { + struct switchdev_obj obj; + struct sk_buff *skb; + struct netlink_callback *cb; + struct net_device *filter_dev; + int idx; +}; + +static int switchdev_port_fdb_dump_cb(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_fdb_dump *dump = + container_of(obj, struct switchdev_fdb_dump, obj); + u32 portid = NETLINK_CB(dump->cb->skb).portid; + u32 seq = dump->cb->nlh->nlmsg_seq; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + struct net_device *master = netdev_master_upper_dev_get(dev); + + if (dump->idx < dump->cb->args[0]) + goto skip; + + if (master && dump->filter_dev != master) + goto skip; + + nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, + sizeof(*ndm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = NTF_SELF; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dev->ifindex; + ndm->ndm_state = NUD_REACHABLE; + + if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr)) + goto nla_put_failure; + + if (obj->u.fdb.vid && nla_put_u16(dump->skb, NDA_VLAN, obj->u.fdb.vid)) + goto nla_put_failure; + + nlmsg_end(dump->skb, nlh); + +skip: + dump->idx++; + return 0; + +nla_put_failure: + nlmsg_cancel(dump->skb, nlh); + return -EMSGSIZE; +} + +/** + * switchdev_port_fdb_dump - Dump port FDB (MAC/VLAN) entries + * + * @skb: netlink skb + * @cb: netlink callback + * @dev: port device + * @filter_dev: filter device + * @idx: + * + * Delete FDB entry from switch device. + */ +int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev, + struct net_device *filter_dev, int idx) +{ + struct switchdev_fdb_dump dump = { + .obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .cb = switchdev_port_fdb_dump_cb, + }, + .skb = skb, + .cb = cb, + .filter_dev = filter_dev, + .idx = idx, + }; + int err; + + err = switchdev_port_obj_dump(dev, &dump.obj); + if (err) + return err; + + return dump.idx; +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); + static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) { const struct switchdev_ops *ops = dev->switchdev_ops; -- cgit From 74b80e841b4da2f83d8f548f32cfe421fde85aca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 May 2015 13:27:32 +0200 Subject: flow_dissector: remove bogus return in tipc section Fixes: 06635a35d13d42b9 ("flow_dissect: use programable dissector in skb_flow_dissect and friends") Reported-by: Dan Carpenter Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 204d09c42510..703d05916a80 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -279,7 +279,6 @@ flow_label: if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { - return true; key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); -- cgit From de133464c9e70808d3e5a861294bc55940988178 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 15 May 2015 14:47:32 -0700 Subject: netns: make nsid_lock per net The spinlock is used to protect netns_ids which is per net, so there is no need to use a global spinlock. Cc: Nicolas Dichtel Signed-off-by: Cong Wang Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index d2f42da9479b..2c2eb1b629b1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -28,7 +28,6 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; DEFINE_MUTEX(net_mutex); -static DEFINE_SPINLOCK(nsid_lock); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -218,10 +217,10 @@ int peernet2id_alloc(struct net *net, struct net *peer) bool alloc; int id; - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); alloc = atomic_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; @@ -234,9 +233,9 @@ int peernet2id(struct net *net, struct net *peer) unsigned long flags; int id; - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); id = __peernet2id(net, peer); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); return id; } @@ -257,11 +256,11 @@ struct net *get_net_ns_by_id(struct net *net, int id) return NULL; rcu_read_lock(); - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); rcu_read_unlock(); return peer; @@ -282,6 +281,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); + spin_lock_init(&net->nsid_lock); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -404,17 +404,17 @@ static void cleanup_net(struct work_struct *work) for_each_net(tmp) { int id; - spin_lock_irq(&nsid_lock); + spin_lock_irq(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_irq(&nsid_lock); + spin_unlock_irq(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id); } - spin_lock_irq(&nsid_lock); + spin_lock_irq(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_irq(&nsid_lock); + spin_unlock_irq(&net->nsid_lock); } rtnl_unlock(); @@ -563,15 +563,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); if (__peernet2id(net, peer) >= 0) { - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; @@ -695,9 +695,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) }; unsigned long flags; - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); cb->args[0] = net_cb.idx; return skb->len; -- cgit From b9fbe709de4dbe663613ebb852f35aef2467872c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 17 May 2015 10:45:34 +0800 Subject: netlink: Use random autobind rover Currently we use a global rover to select a port ID that is unique. This used to work consistently when it was protected with a global lock. However as we're now lockless, the global rover can exhibit pathological behaviour should multiple threads all stomp on it at the same time. Granted this will eventually resolve itself but the process is suboptimal. This patch replaces the global rover with a pseudorandom starting point to avoid this issue. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 164ded7050b8..136056f0c62c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1299,20 +1299,24 @@ static int netlink_autobind(struct socket *sock) struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; - static s32 rover = -4097; + s32 rover = -4096; + bool ok; retry: cond_resched(); rcu_read_lock(); - if (__netlink_lookup(table, portid, net)) { + ok = !__netlink_lookup(table, portid, net); + rcu_read_unlock(); + if (!ok) { /* Bind collision, search negative portid values. */ - portid = rover--; - if (rover > -4097) + if (rover == -4096) + /* rover will be in range [S32_MIN, -4097] */ + rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN); + else if (rover >= -4096) rover = -4097; - rcu_read_unlock(); + portid = rover--; goto retry; } - rcu_read_unlock(); err = netlink_insert(sk, portid); if (err == -EADDRINUSE) -- cgit From 8faf491e642011f449d6405be52bedb70964aed6 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 14 May 2015 11:16:59 +0800 Subject: xfrm: optimise to search the inexact policy list The policies are organized into list by priority ascent of policy, so it is unnecessary to continue to loop the policy if the priority of current looped police is larger than or equal priority which is from the policy_bydst list. This allows to match policy with ~0U priority in inexact list too. Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 3d264e5e7409..18cead7645be 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1114,6 +1114,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { + if ((pol->priority >= priority) && ret) + break; + err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { if (err == -ESRCH) @@ -1122,7 +1125,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, ret = ERR_PTR(err); goto fail; } - } else if (pol->priority < priority) { + } else { ret = pol; break; } @@ -3203,9 +3206,11 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { + if ((pol->priority >= priority) && ret) + break; + if (xfrm_migrate_selector_match(sel, &pol->selector) && - pol->type == type && - pol->priority < priority) { + pol->type == type) { ret = pol; break; } -- cgit From 5cf422808244ca8f1177c72fe6e1ce8322794b57 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 15 May 2015 14:15:35 -0700 Subject: ipv4: introduce frag_expire_skip_icmp() Improve readability of skip ICMP for de-fragmentation expiration logic. This change will also make the logic easier to maintain when the following patches in this series are applied. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cc1da6d9cb35..83424f1742b8 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -173,6 +173,13 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } +static bool frag_expire_skip_icmp(u32 user) +{ + return user == IP_DEFRAG_AF_PACKET || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, + __IP_DEFRAG_CONNTRACK_IN_END); +} + /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -217,10 +224,8 @@ static void ip_expire(unsigned long arg) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_AF_PACKET || - ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && - (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && - (skb_rtable(head)->rt_type != RTN_LOCAL))) + if (frag_expire_skip_icmp(qp->user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; /* Send an ICMP "Fragment Reassembly Timeout" message. */ -- cgit From 8bc04864ac89616e55fc8a196dd32b7066433ea8 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 15 May 2015 14:15:36 -0700 Subject: IPv4: skip ICMP for bridge contrack users when defrag expires users in [IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BR_IN] should not ICMP message also. Reported-by: Florian Westphal Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 83424f1742b8..47fa64ee82b1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -177,7 +177,9 @@ static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, - __IP_DEFRAG_CONNTRACK_IN_END); + __IP_DEFRAG_CONNTRACK_IN_END) || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP_DEFRAG_CONNTRACK_BRIDGE_IN); } /* -- cgit From 49d16b23cd1e61c028ee088c5a64e9ac6a9c6147 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 15 May 2015 14:15:37 -0700 Subject: bridge_netfilter: No ICMP packet on IPv4 fragmentation error When bridge netfilter re-fragments an IP packet for output, all packets that can not be re-fragmented to their original input size should be silently discarded. However, current bridge netfilter output path generates an ICMP packet with 'size exceeded MTU' message for such packets, this is a bug. This patch refactors the ip_fragment() API to allow two separate use cases. The bridge netfilter user case will not send ICMP, the routing output will, as before. Signed-off-by: Andy Zhou Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 21 ++++++++++++++++++++- net/ipv4/ip_output.c | 40 ++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 2b0e8bb49944..1d2eb32d8270 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -853,6 +853,25 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) return br_dev_queue_push_xmit(sk, skb); } +static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) +{ + unsigned int mtu = ip_skb_dst_mtu(skb); + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(sk, skb, output); +} + static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) { int ret; @@ -886,7 +905,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - ret = ip_fragment(sk, skb, br_nf_push_frag_xmit); + ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); } else { nf_bridge_info_free(skb); ret = br_dev_queue_push_xmit(sk, skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2acc5dc32807..8d91b922fcfe 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,6 +83,9 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)); + /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) { @@ -478,6 +481,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int mtu = ip_skb_dst_mtu(skb); + + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(sk, skb, output); +} + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -485,8 +510,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -507,15 +532,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || - (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > mtu))) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - kfree_skb(skb); - return -EMSGSIZE; - } /* * Setup starting values. @@ -751,7 +767,7 @@ fail: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } -EXPORT_SYMBOL(ip_fragment); +EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) -- cgit From 0cf0879acdbeff695963d78940b89497803ae55c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:37 +0200 Subject: nl802154: cleanup invalid argument handling This patch cleanups the -EINVAL cases by combining them in one condition. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index f3c12f6a4a39..c01f4bbe7c7c 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -668,10 +668,8 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) return -EBUSY; /* don't change address fields on monitor */ - if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) - return -EINVAL; - - if (!info->attrs[NL802154_ATTR_PAN_ID]) + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || + !info->attrs[NL802154_ATTR_PAN_ID]) return -EINVAL; pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); @@ -691,10 +689,8 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) return -EBUSY; /* don't change address fields on monitor */ - if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) - return -EINVAL; - - if (!info->attrs[NL802154_ATTR_SHORT_ADDR]) + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || + !info->attrs[NL802154_ATTR_SHORT_ADDR]) return -EINVAL; short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); -- cgit From 673692faf3fb42ec73ef7210238b5869d6e2873d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:38 +0200 Subject: ieee802154: move validation check out of softmac This patch moves the value validation out of softmac layer. We need to be sure now that this value is accepted by the transceiver/mac802154 or "possible" hardmac drivers before calling rdev-ops. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 28 +++++++++++++++++++++++++++- net/mac802154/cfg.c | 29 ----------------------------- 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index c01f4bbe7c7c..d750d9778ba7 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -625,7 +625,8 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info) channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]); /* check 802.15.4 constraints */ - if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL) + if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL || + !(rdev->wpan_phy.channels_supported[page] & BIT(channel))) return -EINVAL; return rdev_set_channel(rdev, page, channel); @@ -674,6 +675,16 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); + /* TODO + * I am not sure about to check here on broadcast pan_id. + * Broadcast is a valid setting, comment from 802.15.4: + * If this value is 0xffff, the device is not associated. + * + * This could useful to simple deassociate an device. + */ + if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + return -EINVAL; + return rdev_set_pan_id(rdev, wpan_dev, pan_id); } @@ -695,6 +706,21 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); + /* TODO + * I am not sure about to check here on broadcast short_addr. + * Broadcast is a valid setting, comment from 802.15.4: + * A value of 0xfffe indicates that the device has + * associated but has not been allocated an address. A + * value of 0xffff indicates that the device does not + * have a short address. + * + * I think we should allow to set these settings but + * don't allow to allow socket communication with it. + */ + if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || + short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) + return -EINVAL; + return rdev_set_short_addr(rdev, wpan_dev, short_addr); } diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 70be9c799f8a..c63601582c71 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -73,10 +73,6 @@ ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel) ASSERT_RTNL(); - /* check if phy support this setting */ - if (!(wpan_phy->channels_supported[page] & BIT(channel))) - return -EINVAL; - ret = drv_set_channel(local, page, channel); if (!ret) { wpan_phy->current_page = page; @@ -112,16 +108,6 @@ ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); - /* TODO - * I am not sure about to check here on broadcast pan_id. - * Broadcast is a valid setting, comment from 802.15.4: - * If this value is 0xffff, the device is not associated. - * - * This could useful to simple deassociate an device. - */ - if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) - return -EINVAL; - wpan_dev->pan_id = pan_id; return 0; } @@ -149,21 +135,6 @@ ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); - /* TODO - * I am not sure about to check here on broadcast short_addr. - * Broadcast is a valid setting, comment from 802.15.4: - * A value of 0xfffe indicates that the device has - * associated but has not been allocated an address. A - * value of 0xffff indicates that the device does not - * have a short address. - * - * I think we should allow to set these settings but - * don't allow to allow socket communication with it. - */ - if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || - short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) - return -EINVAL; - wpan_dev->short_addr = short_addr; return 0; } -- cgit From 1a19cb680be0d4b06ce9a9d6516b8f45f544d3e8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:39 +0200 Subject: ieee802154: change transmit power to s32 This patch change the transmit power from s8 to s32. This prepares to store a mbm value instead dbm inside the transmit power variable. The old interface keep the a s8 dbm value, which should be backward compatibility when assign s8 to s32. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 6 +++--- net/mac802154/driver-ops.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index d750d9778ba7..f3185c71af10 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -207,7 +207,7 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_PAGE] = { .type = NLA_U8, }, [NL802154_ATTR_CHANNEL] = { .type = NLA_U8, }, - [NL802154_ATTR_TX_POWER] = { .type = NLA_S8, }, + [NL802154_ATTR_TX_POWER] = { .type = NLA_S32, }, [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, }, @@ -301,8 +301,8 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, goto nla_put_failure; } - if (nla_put_s8(msg, NL802154_ATTR_TX_POWER, - rdev->wpan_phy.transmit_power)) + if (nla_put_s32(msg, NL802154_ATTR_TX_POWER, + rdev->wpan_phy.transmit_power)) goto nla_put_failure; finish: diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index a0533357b9ea..57c1bdbfaa91 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -58,7 +58,7 @@ drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) return local->ops->set_channel(&local->hw, page, channel); } -static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm) +static inline int drv_set_tx_power(struct ieee802154_local *local, s32 dbm) { might_sleep(); -- cgit From e2eb173aaacd1a1bcd255d3e74ffb719e47eeadb Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:40 +0200 Subject: ieee802154: change transmit power to mbm This patch change the handling of transmit power level from dbm to mbm. This prepares to handle floating point transmit power levels values. The old netlink 802.15.4 will convert the dbm value to mbm for handling backward compatibility. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl-mac.c | 4 ++-- net/mac802154/driver-ops.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 2b4955d7aae5..4ba2e13f7a07 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -117,7 +117,7 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, rtnl_unlock(); if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER, - params.transmit_power) || + params.transmit_power / 100) || nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) || nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE, params.cca.mode) || @@ -510,7 +510,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) ops->get_mac_params(dev, ¶ms); if (info->attrs[IEEE802154_ATTR_TXPOWER]) - params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]); + params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100; if (info->attrs[IEEE802154_ATTR_LBT_ENABLED]) params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]); diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index 57c1bdbfaa91..d289ae3f1e93 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -58,7 +58,7 @@ drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) return local->ops->set_channel(&local->hw, page, channel); } -static inline int drv_set_tx_power(struct ieee802154_local *local, s32 dbm) +static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { might_sleep(); @@ -67,7 +67,7 @@ static inline int drv_set_tx_power(struct ieee802154_local *local, s32 dbm) return -EOPNOTSUPP; } - return local->ops->set_txpower(&local->hw, dbm); + return local->ops->set_txpower(&local->hw, mbm); } static inline int drv_set_cca_mode(struct ieee802154_local *local, -- cgit From 32b23550ad64d9676f2218b3d5de46bacf98ef1d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:41 +0200 Subject: ieee802154: change cca ed level to mbm This patch change the handling of cca energy detection level from dbm to mbm. This prepares to handle floating point cca energy detection levels values. The old netlink 802.15.4 will convert the dbm value to mbm for handling backward compatibility. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl-mac.c | 4 ++-- net/mac802154/driver-ops.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 4ba2e13f7a07..cdc1cc3543f6 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -122,7 +122,7 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE, params.cca.mode) || nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL, - params.cca_ed_level) || + params.cca_ed_level / 100) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES, params.csma_retries) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE, @@ -519,7 +519,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]); if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) - params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]); + params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100; if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES]) params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]); diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index d289ae3f1e93..caecd5f43aa7 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -96,7 +96,7 @@ static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) } static inline int -drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level) +drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { might_sleep(); @@ -105,7 +105,7 @@ drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level) return -EOPNOTSUPP; } - return local->ops->set_cca_ed_level(&local->hw, ed_level); + return local->ops->set_cca_ed_level(&local->hw, mbm); } static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) -- cgit From 72f655e44db9c7e835ceba96dc03cbe979d3f80d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:42 +0200 Subject: ieee802154: introduce wpan_phy_supported This patch introduce the wpan_phy_supported struct for wpan_phy. There is currently no way to check if a transceiver can handle IEEE 802.15.4 complaint values. With this struct we can check before if the transceiver supports these values before sending to driver layer. Signed-off-by: Alexander Aring Suggested-by: Phoebe Buckheister Acked-by: Varka Bhadram Cc: Alan Ott Signed-off-by: Marcel Holtmann --- net/ieee802154/nl-phy.c | 4 ++-- net/ieee802154/nl802154.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 346c6665d25e..cbc0d09351e0 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -56,8 +56,8 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel)) goto nla_put_failure; for (i = 0; i < 32; i++) { - if (phy->channels_supported[i]) - buf[pages++] = phy->channels_supported[i] | (i << 27); + if (phy->supported.channels[i]) + buf[pages++] = phy->supported.channels[i] | (i << 27); } if (pages && nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index f3185c71af10..d271879a2174 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -248,7 +248,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, for (page = 0; page <= IEEE802154_MAX_PAGE; page++) { if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL, - rdev->wpan_phy.channels_supported[page])) + rdev->wpan_phy.supported.channels[page])) return -ENOBUFS; } nla_nest_end(msg, nl_page); @@ -626,7 +626,7 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info) /* check 802.15.4 constraints */ if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL || - !(rdev->wpan_phy.channels_supported[page] & BIT(channel))) + !(rdev->wpan_phy.supported.channels[page] & BIT(channel))) return -EINVAL; return rdev_set_channel(rdev, page, channel); -- cgit From fea3318d20776a94afeea0460c6ee9904e60569e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:43 +0200 Subject: ieee802154: add several phy supported handling This patch adds support for phy supported handling for all other already existing handling 802.15.4 functionality. We assume now a fully 802.15.4 complaint transceiver at phy allocation. If a transceiver can support 802.15.4 default values only, then the values should be overwirtten by values the transceiver supports. If the transceiver doesn't set the according hardware flags, we assume the 802.15.4 defaults now which cannot be changed. Signed-off-by: Alexander Aring Suggested-by: Phoebe Buckheister Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 22 +++++++++++++++++----- net/mac802154/main.c | 26 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index d271879a2174..fa0c4048b0e8 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -642,7 +642,9 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]); /* checking 802.15.4 constraints */ - if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX) + if (cca.mode < NL802154_CCA_ENERGY || + cca.mode > NL802154_CCA_ATTR_MAX || + !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode))) return -EINVAL; if (cca.mode == NL802154_CCA_ENERGY_CARRIER) { @@ -650,7 +652,8 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) return -EINVAL; cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]); - if (cca.opt > NL802154_CCA_OPT_ATTR_MAX) + if (cca.opt > NL802154_CCA_OPT_ATTR_MAX || + !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt))) return -EINVAL; } @@ -744,7 +747,11 @@ nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info) max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]); /* check 802.15.4 constraints */ - if (max_be < 3 || max_be > 8 || min_be > max_be) + if (min_be < rdev->wpan_phy.supported.min_minbe || + min_be > rdev->wpan_phy.supported.max_minbe || + max_be < rdev->wpan_phy.supported.min_maxbe || + max_be > rdev->wpan_phy.supported.max_maxbe || + min_be > max_be) return -EINVAL; return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be); @@ -769,7 +776,8 @@ nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info) info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]); /* check 802.15.4 constraints */ - if (max_csma_backoffs > 5) + if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs || + max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs) return -EINVAL; return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs); @@ -793,7 +801,8 @@ nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info) info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]); /* check 802.15.4 constraints */ - if (max_frame_retries < -1 || max_frame_retries > 7) + if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries || + max_frame_retries > rdev->wpan_phy.supported.max_frame_retries) return -EINVAL; return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries); @@ -813,6 +822,9 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) return -EINVAL; mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt)) + return -EINVAL; + return rdev_set_lbt_mode(rdev, wpan_dev, mode); } diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 08cb32dc8fd3..ddcd6ff8d39c 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -107,6 +107,15 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) skb_queue_head_init(&local->skb_queue); + /* init supported flags with 802.15.4 default ranges */ + phy->supported.max_minbe = 8; + phy->supported.min_maxbe = 3; + phy->supported.max_maxbe = 8; + phy->supported.min_frame_retries = -1; + phy->supported.max_frame_retries = 7; + phy->supported.max_csma_backoffs = 5; + phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE; + return &local->hw; } EXPORT_SYMBOL(ieee802154_alloc_hw); @@ -155,6 +164,23 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) ieee802154_setup_wpan_phy_pib(local->phy); + if (!(hw->flags & IEEE802154_HW_CSMA_PARAMS)) { + local->phy->supported.min_csma_backoffs = 4; + local->phy->supported.max_csma_backoffs = 4; + local->phy->supported.min_maxbe = 5; + local->phy->supported.max_maxbe = 5; + local->phy->supported.min_minbe = 3; + local->phy->supported.max_minbe = 3; + } + + if (!(hw->flags & IEEE802154_HW_FRAME_RETRIES)) { + /* TODO should be 3, but our default value is -1 which means + * no ARET handling. + */ + local->phy->supported.min_frame_retries = -1; + local->phy->supported.max_frame_retries = -1; + } + rc = wpan_phy_register(local->phy); if (rc < 0) goto out_wq; -- cgit From 791021bf13ec9d0fc14bfd8c9c4b368ace568239 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:44 +0200 Subject: mac802154: check for really changes This patch adds check if the value is really changed inside pib/mib. If a transceiver do support only one value for e.g. max_be then this will also handle that the driver layer doesn't need to care about handling to set one value only. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/cfg.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index c63601582c71..45c4dc39766e 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -73,6 +73,10 @@ ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel) ASSERT_RTNL(); + if (wpan_phy->current_page == page && + wpan_phy->current_channel == channel) + return 0; + ret = drv_set_channel(local, page, channel); if (!ret) { wpan_phy->current_page = page; @@ -91,6 +95,9 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, ASSERT_RTNL(); + if (wpan_phy_cca_cmp(&wpan_phy->cca, cca)) + return 0; + /* check if phy support this setting */ if (!(local->hw.flags & IEEE802154_HW_CCA_MODE)) return -EOPNOTSUPP; @@ -108,6 +115,9 @@ ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); + if (wpan_dev->pan_id == pan_id) + return 0; + wpan_dev->pan_id = pan_id; return 0; } @@ -121,6 +131,10 @@ ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy, ASSERT_RTNL(); + if (wpan_dev->min_be == min_be && + wpan_dev->max_be == max_be) + return 0; + if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) return -EOPNOTSUPP; @@ -135,6 +149,9 @@ ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); + if (wpan_dev->short_addr == short_addr) + return 0; + wpan_dev->short_addr = short_addr; return 0; } @@ -148,6 +165,9 @@ ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy, ASSERT_RTNL(); + if (wpan_dev->csma_retries == max_csma_backoffs) + return 0; + if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) return -EOPNOTSUPP; @@ -164,6 +184,9 @@ ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy, ASSERT_RTNL(); + if (wpan_dev->frame_retries == max_frame_retries) + return 0; + if (!(local->hw.flags & IEEE802154_HW_FRAME_RETRIES)) return -EOPNOTSUPP; @@ -179,6 +202,9 @@ ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, ASSERT_RTNL(); + if (wpan_dev->lbt == mode) + return 0; + if (!(local->hw.flags & IEEE802154_HW_LBT)) return -EOPNOTSUPP; -- cgit From 8329fcf11f5c2ae2fb7b1f56f05b0bb899babc85 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:45 +0200 Subject: mac802154: remove check if operation is supported This patch removes the check if operation is supported by driver layer. This is done now by capabilities flags, if these are valid then the driver should support the operation, otherwise a WARN_ON occurs. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/cfg.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'net') diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 45c4dc39766e..d5290ea49dca 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -98,10 +98,6 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, if (wpan_phy_cca_cmp(&wpan_phy->cca, cca)) return 0; - /* check if phy support this setting */ - if (!(local->hw.flags & IEEE802154_HW_CCA_MODE)) - return -EOPNOTSUPP; - ret = drv_set_cca_mode(local, cca); if (!ret) wpan_phy->cca = *cca; @@ -127,17 +123,12 @@ ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); if (wpan_dev->min_be == min_be && wpan_dev->max_be == max_be) return 0; - if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) - return -EOPNOTSUPP; - wpan_dev->min_be = min_be; wpan_dev->max_be = max_be; return 0; @@ -161,16 +152,11 @@ ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); if (wpan_dev->csma_retries == max_csma_backoffs) return 0; - if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) - return -EOPNOTSUPP; - wpan_dev->csma_retries = max_csma_backoffs; return 0; } @@ -180,16 +166,11 @@ ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, s8 max_frame_retries) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); if (wpan_dev->frame_retries == max_frame_retries) return 0; - if (!(local->hw.flags & IEEE802154_HW_FRAME_RETRIES)) - return -EOPNOTSUPP; - wpan_dev->frame_retries = max_frame_retries; return 0; } @@ -198,16 +179,11 @@ static int ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); if (wpan_dev->lbt == mode) return 0; - if (!(local->hw.flags & IEEE802154_HW_LBT)) - return -EOPNOTSUPP; - wpan_dev->lbt = mode; return 0; } -- cgit From edea8f7c75ec6c238130bd7e74d9f6f4c26e97b0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:46 +0200 Subject: cfg802154: introduce wpan phy flags This patch introduce a flag property for the wpan phy structure. The current flag settings in ieee802154_hw are accessable in mac802154 layer only which is okay for flags which indicates MAC handling which are done by phy. For real PHY layer settings like cca mode, transmit power, cca energy detection level. The difference between these flags are that the MAC handling flags are only handled in mac802154/HardMac layer e.g. on an interface up. The phy settings are direct netlink calls from nl802154 into the driver layer and the nl802154 need to have a chance to check if the driver supports this handling before sending to the next layer. We also check now on PHY flags while dumping and setting pib attributes. In comparing with MIB attributes the 802.15.4 gives us an default value which we assume when a transceiver implement less functionality. In case of MIB settings the nl802154 layer doesn't need to check on the ieee802154_hw flags then. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 27 +++++++++++++++++---------- net/mac802154/mac_cmd.c | 6 +++--- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index fa0c4048b0e8..40fb0be009b2 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -291,19 +291,23 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, goto nla_put_failure; /* cca mode */ - if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, - rdev->wpan_phy.cca.mode)) - goto nla_put_failure; - - if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { - if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, - rdev->wpan_phy.cca.opt)) + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { + if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, + rdev->wpan_phy.cca.mode)) goto nla_put_failure; + + if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { + if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, + rdev->wpan_phy.cca.opt)) + goto nla_put_failure; + } } - if (nla_put_s32(msg, NL802154_ATTR_TX_POWER, - rdev->wpan_phy.transmit_power)) - goto nla_put_failure; + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { + if (nla_put_s32(msg, NL802154_ATTR_TX_POWER, + rdev->wpan_phy.transmit_power)) + goto nla_put_failure; + } finish: genlmsg_end(msg, hdr); @@ -637,6 +641,9 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_phy_cca cca; + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) + return -EOPNOTSUPP; + if (!info->attrs[NL802154_ATTR_CCA_MODE]) return -EINVAL; diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index bdccb4ecd30f..6dcbb3b5994c 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -91,19 +91,19 @@ static int mac802154_set_mac_params(struct net_device *dev, wpan_dev->frame_retries = params->frame_retries; wpan_dev->lbt = params->lbt; - if (local->hw.flags & IEEE802154_HW_TXPOWER) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_TXPOWER) { ret = drv_set_tx_power(local, params->transmit_power); if (ret < 0) return ret; } - if (local->hw.flags & IEEE802154_HW_CCA_MODE) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_MODE) { ret = drv_set_cca_mode(local, ¶ms->cca); if (ret < 0) return ret; } - if (local->hw.flags & IEEE802154_HW_CCA_ED_LEVEL) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { ret = drv_set_cca_ed_level(local, params->cca_ed_level); if (ret < 0) return ret; -- cgit From 65318680c97cca15e3678148b3a5acaa33e991ec Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:47 +0200 Subject: ieee802154: add iftypes capability This patch adds capability flags for supported interface types. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 3 ++- net/mac802154/main.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 40fb0be009b2..3afb20e43ff8 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -579,7 +579,8 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL802154_ATTR_IFTYPE]) { type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]); - if (type > NL802154_IFTYPE_MAX) + if (type > NL802154_IFTYPE_MAX || + !(rdev->wpan_phy.supported.iftypes & BIT(type))) return -EINVAL; } diff --git a/net/mac802154/main.c b/net/mac802154/main.c index ddcd6ff8d39c..356b346e1ee8 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -116,6 +116,9 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) phy->supported.max_csma_backoffs = 5; phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE; + /* always supported */ + phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE); + return &local->hw; } EXPORT_SYMBOL(ieee802154_alloc_hw); @@ -181,6 +184,9 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) local->phy->supported.max_frame_retries = -1; } + if (hw->flags & IEEE802154_HW_PROMISCUOUS) + local->phy->supported.iftypes |= BIT(NL802154_IFTYPE_MONITOR); + rc = wpan_phy_register(local->phy); if (rc < 0) goto out_wq; -- cgit From 0e66545701014814360b08a7a43ca652f82b6e5a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:53 +0200 Subject: nl802154: add support for dump phy capabilities This patch add support to nl802154 to dump all phy capabilities which is inside the wpan_phy_supported struct. Also we introduce a new method to dumping supported channels. The new method will offer a easier interface and has lesser netlink traffic. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 117 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 3afb20e43ff8..54f4959fc9bb 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -225,6 +225,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, }, [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, }, + + [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED }, }; /* message building helper */ @@ -235,6 +237,28 @@ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, return genlmsg_put(skb, portid, seq, &nl802154_fam, flags, cmd); } +static int +nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask) +{ + struct nlattr *nl_flags = nla_nest_start(msg, attr); + int i; + + if (!nl_flags) + return -ENOBUFS; + + i = 0; + while (mask) { + if ((mask & 1) && nla_put_flag(msg, i)) + return -ENOBUFS; + + mask >>= 1; + i++; + } + + nla_nest_end(msg, nl_flags); + return 0; +} + static int nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, struct sk_buff *msg) @@ -256,6 +280,92 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, return 0; } +static int +nl802154_put_capabilities(struct sk_buff *msg, + struct cfg802154_registered_device *rdev) +{ + const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported; + struct nlattr *nl_caps, *nl_channels; + int i; + + nl_caps = nla_nest_start(msg, NL802154_ATTR_WPAN_PHY_CAPS); + if (!nl_caps) + return -ENOBUFS; + + nl_channels = nla_nest_start(msg, NL802154_CAP_ATTR_CHANNELS); + if (!nl_channels) + return -ENOBUFS; + + for (i = 0; i <= IEEE802154_MAX_PAGE; i++) { + if (caps->channels[i]) { + if (nl802154_put_flags(msg, i, caps->channels[i])) + return -ENOBUFS; + } + } + + nla_nest_end(msg, nl_channels); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { + struct nlattr *nl_ed_lvls; + + nl_ed_lvls = nla_nest_start(msg, + NL802154_CAP_ATTR_CCA_ED_LEVELS); + if (!nl_ed_lvls) + return -ENOBUFS; + + for (i = 0; i < caps->cca_ed_levels_size; i++) { + if (nla_put_s32(msg, i, caps->cca_ed_levels[i])) + return -ENOBUFS; + } + + nla_nest_end(msg, nl_ed_lvls); + } + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { + struct nlattr *nl_tx_pwrs; + + nl_tx_pwrs = nla_nest_start(msg, NL802154_CAP_ATTR_TX_POWERS); + if (!nl_tx_pwrs) + return -ENOBUFS; + + for (i = 0; i < caps->tx_powers_size; i++) { + if (nla_put_s32(msg, i, caps->tx_powers[i])) + return -ENOBUFS; + } + + nla_nest_end(msg, nl_tx_pwrs); + } + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { + if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES, + caps->cca_modes) || + nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS, + caps->cca_opts)) + return -ENOBUFS; + } + + if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS, + caps->min_csma_backoffs) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS, + caps->max_csma_backoffs) || + nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES, + caps->min_frame_retries) || + nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES, + caps->max_frame_retries) || + nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES, + caps->iftypes) || + nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt)) + return -ENOBUFS; + + nla_nest_end(msg, nl_caps); + + return 0; +} + static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, enum nl802154_commands cmd, struct sk_buff *msg, u32 portid, u32 seq, @@ -286,7 +396,9 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, rdev->wpan_phy.current_channel)) goto nla_put_failure; - /* supported channels array */ + /* TODO remove this behaviour, we still keep support it for a while + * so users can change the behaviour to the new one. + */ if (nl802154_send_wpan_phy_channels(rdev, msg)) goto nla_put_failure; @@ -309,6 +421,9 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, goto nla_put_failure; } + if (nl802154_put_capabilities(msg, rdev)) + goto nla_put_failure; + finish: genlmsg_end(msg, hdr); return 0; -- cgit From 3862eba691e3928338e188915676dd4fa7cefcaa Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 17 May 2015 21:44:56 +0200 Subject: mac802154: tx: allow xmit complete from hard irq Replace consume_skb with dev_consume_skb_any in ieee802154_xmit_complete which can be called in hard irq and other contexts. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/util.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 150bf807e572..583435f38930 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -85,11 +85,10 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, hrtimer_start(&local->ifs_timer, ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC), HRTIMER_MODE_REL); - - consume_skb(skb); } else { ieee802154_wake_queue(hw); - consume_skb(skb); } + + dev_consume_skb_any(skb); } EXPORT_SYMBOL(ieee802154_xmit_complete); -- cgit From 73e85ed36a96324a2ba5990024f621bf9907fbd5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 May 2015 14:18:35 +0200 Subject: mac802154: select CRYPTO when needed The mac802154 subsystem uses functions from the crypto layer and correctly selects the individual crypto algorithms, but fails to build when the crypto layer is disabled altogether: crypto/built-in.o: In function `crypto_ctr_free': :(.text+0x80): undefined reference to `crypto_drop_spawn' crypto/built-in.o: In function `crypto_rfc3686_free': :(.text+0xac): undefined reference to `crypto_drop_spawn' crypto/built-in.o: In function `crypto_ctr_crypt': :(.text+0x2f0): undefined reference to `blkcipher_walk_virt_block' :(.text+0x2f8): undefined reference to `crypto_inc' To solve that, this patch also selects the core crypto code, like all other users of that code do. Signed-off-by: Arnd Bergmann Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/mac802154/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac802154/Kconfig b/net/mac802154/Kconfig index aa462b480a39..fb45287ebac3 100644 --- a/net/mac802154/Kconfig +++ b/net/mac802154/Kconfig @@ -2,6 +2,7 @@ config MAC802154 tristate "Generic IEEE 802.15.4 Soft Networking Stack (mac802154)" depends on IEEE802154 select CRC_CCITT + select CRYPTO select CRYPTO_AUTHENC select CRYPTO_CCM select CRYPTO_CTR -- cgit From 011c391a090e243ef0116805a6ad77df74c22cc0 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 19 May 2015 21:06:04 +0300 Subject: Bluetooth: Add debug logs for legacy SMP crypto functions To help debug legacy SMP crypto functions add debug logs of the various values involved. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1ab3dc9c8f99..659371af39e4 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -371,6 +371,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) uint8_t tmp[16], data[16]; int err; + SMP_DBG("k %16phN r %16phN", k, r); + if (!tfm) { BT_ERR("tfm %p", tfm); return -EINVAL; @@ -400,6 +402,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); + SMP_DBG("r %16phN", r); + return err; } @@ -410,6 +414,10 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], u8 p1[16], p2[16]; int err; + SMP_DBG("k %16phN r %16phN", k, r); + SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra); + SMP_DBG("preq %7phN pres %7phN", preq, pres); + memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ @@ -418,10 +426,7 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], memcpy(p1 + 2, preq, 7); memcpy(p1 + 9, pres, 7); - /* p2 = padding || ia || ra */ - memcpy(p2, ra, 6); - memcpy(p2 + 6, ia, 6); - memset(p2 + 12, 0, 4); + SMP_DBG("p1 %16phN", p1); /* res = r XOR p1 */ u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); @@ -433,6 +438,13 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], return err; } + /* p2 = padding || ia || ra */ + memcpy(p2, ra, 6); + memcpy(p2 + 6, ia, 6); + memset(p2 + 12, 0, 4); + + SMP_DBG("p2 %16phN", p2); + /* res = res XOR p2 */ u128_xor((u128 *) res, (u128 *) res, (u128 *) p2); -- cgit From aea0929e516a1ff4c4458203d1f3375eee9acc26 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Mon, 18 May 2015 14:35:58 -0400 Subject: tcp: Return error instead of partial read for saved syn headers Currently the getsockopt() requesting the cached contents of the syn packet headers will fail silently if the caller uses a buffer that is too small to contain the requested data. Rather than fail silently and discard the headers, getsockopt() should return an error and report the required size to hold the data. Signed-off-by: Eric B Munson Cc: Eric Dumazet Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c724195e5862..bb9bb844204f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2845,7 +2845,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, lock_sock(sk); if (tp->saved_syn) { - len = min_t(unsigned int, tp->saved_syn[0], len); + if (len < tp->saved_syn[0]) { + if (put_user(tp->saved_syn[0], optlen)) { + release_sock(sk); + return -EFAULT; + } + release_sock(sk); + return -EINVAL; + } + len = tp->saved_syn[0]; if (put_user(len, optlen)) { release_sock(sk); return -EFAULT; -- cgit From 492135557dc090a1abb2cfbe1a412757e3ed68ab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2015 21:04:22 +0200 Subject: tcp: add rfc3168, section 6.1.1.1. fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work as a follow-up of commit f7b3bec6f516 ("net: allow setting ecn via routing table") and adds RFC3168 section 6.1.1.1. fallback for outgoing ECN connections. In other words, this work adds a retry with a non-ECN setup SYN packet, as suggested from the RFC on the first timeout: [...] A host that receives no reply to an ECN-setup SYN within the normal SYN retransmission timeout interval MAY resend the SYN and any subsequent SYN retransmissions with CWR and ECE cleared. [...] Schematic client-side view when assuming the server is in tcp_ecn=2 mode, that is, Linux default since 2009 via commit 255cac91c3c9 ("tcp: extend ECN sysctl to allow server-side only ECN"): 1) Normal ECN-capable path: SYN ECE CWR -----> <----- SYN ACK ECE ACK -----> 2) Path with broken middlebox, when client has fallback: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN -----> <----- SYN ACK ACK -----> In case we would not have the fallback implemented, the middlebox drop point would basically end up as: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) In any case, it's rather a smaller percentage of sites where there would occur such additional setup latency: it was found in end of 2014 that ~56% of IPv4 and 65% of IPv6 servers of Alexa 1 million list would negotiate ECN (aka tcp_ecn=2 default), 0.42% of these webservers will fail to connect when trying to negotiate with ECN (tcp_ecn=1) due to timeouts, which the fallback would mitigate with a slight latency trade-off. Recent related paper on this topic: Brian Trammell, Mirja Kühlewind, Damiano Boppart, Iain Learmonth, Gorry Fairhurst, and Richard Scheffenegger: "Enabling Internet-Wide Deployment of Explicit Congestion Notification." Proc. PAM 2015, New York. http://ecn.ethz.ch/ecn-pam15.pdf Thus, when net.ipv4.tcp_ecn=1 is being set, the patch will perform RFC3168, section 6.1.1.1. fallback on timeout. For users explicitly not wanting this which can be in DC use case, we add a net.ipv4.tcp_ecn_fallback knob that allows for disabling the fallback. tp->ecn_flags are not being cleared in tcp_ecn_clear_syn() on output, but rather we let tcp_ecn_rcv_synack() take that over on input path in case a SYN ACK ECE was delayed. Thus a spurious SYN retransmission will not prevent ECN being negotiated eventually in that case. Reference: https://www.ietf.org/proceedings/92/slides/slides-92-iccrg-1.pdf Reference: https://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Mirja Kühlewind Signed-off-by: Brian Trammell Cc: Eric Dumazet Cc: Dave That Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_ipv4.c | 5 ++++- net/ipv4/tcp_output.c | 13 +++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c3852a7ff3c7..841de32f1fee 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -820,6 +820,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_ecn_fallback", + .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 91cb4768a860..0cc4b5a630cd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2411,12 +2411,15 @@ static int __net_init tcp_sk_init(struct net *net) goto fail; *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } + net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn_fallback = 1; + net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; - return 0; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 08c2cc40b26d..e29d43b5a0bb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -350,6 +350,15 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) } } +static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, struct sock *sk) @@ -2615,6 +2624,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } } + /* RFC3168, section 6.1.1.1. ECN fallback */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + tcp_retrans_try_collapse(sk, skb, cur_mss); /* Make a copy, if the first transmission SKB clone we made -- cgit From 94c78cb4523c246ec37959436f04e7c946f7494b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 May 2015 11:31:47 +0200 Subject: mac80211: fix memory leak My recent change here introduced a possible memory leak if the driver registers an invalid cipher schemes. This won't really happen in practice, but fix the leak nonetheless. Fixes: e3a55b5399d55 ("mac80211: validate cipher scheme PN length better") Signed-off-by: Johannes Berg --- net/mac80211/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 3c956c5f99b2..99d27babd9f0 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -770,8 +770,10 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) for (r = 0; r < local->hw.n_cipher_schemes; r++) { suites[w++] = cs[r].cipher; - if (WARN_ON(cs[r].pn_len > IEEE80211_MAX_PN_LEN)) + if (WARN_ON(cs[r].pn_len > IEEE80211_MAX_PN_LEN)) { + kfree(suites); return -EINVAL; + } } } -- cgit From 28f297a7af7e00500d72e6c0421c7e10ec96f627 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 16 May 2015 14:23:55 +0200 Subject: net: rfkill: Switch to PM ops Use dev_pm_ops instead of the legacy suspend/resume callbacks for the rfkill class suspend and resume operations. Signed-off-by: Lars-Peter Clausen Signed-off-by: Johannes Berg --- net/rfkill/core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rfkill/core.c b/net/rfkill/core.c index fa7cd792791c..f12149a29cb1 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -794,7 +794,8 @@ void rfkill_resume_polling(struct rfkill *rfkill) } EXPORT_SYMBOL(rfkill_resume_polling); -static int rfkill_suspend(struct device *dev, pm_message_t state) +#ifdef CONFIG_PM_SLEEP +static int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); @@ -818,13 +819,18 @@ static int rfkill_resume(struct device *dev) return 0; } +static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume); +#define RFKILL_PM_OPS (&rfkill_pm_ops) +#else +#define RFKILL_PM_OPS NULL +#endif + static struct class rfkill_class = { .name = "rfkill", .dev_release = rfkill_release, .dev_groups = rfkill_dev_groups, .dev_uevent = rfkill_dev_uevent, - .suspend = rfkill_suspend, - .resume = rfkill_resume, + .pm = RFKILL_PM_OPS, }; bool rfkill_blocked(struct rfkill *rfkill) -- cgit From 262918d847c0f66a0ec05db9de7571ed72e422af Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 16 May 2015 14:35:54 +0200 Subject: cfg80211: Switch to PM ops Use dev_pm_ops instead of the legacy suspend/resume callbacks for the wiphy class suspend and resume operations. Signed-off-by: Lars-Peter Clausen Signed-off-by: Johannes Berg --- net/wireless/sysfs.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 9ee6bc1a7610..9cee0220665d 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -86,7 +86,7 @@ static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; @@ -95,7 +95,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) cfg80211_leave(rdev, wdev); } -static int wiphy_suspend(struct device *dev, pm_message_t state) +static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; @@ -136,6 +136,11 @@ static int wiphy_resume(struct device *dev) return ret; } + +static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); +#define WIPHY_PM_OPS (&wiphy_pm_ops) +#else +#define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(struct device *d) @@ -151,10 +156,7 @@ struct class ieee80211_class = { .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .dev_uevent = wiphy_uevent, -#ifdef CONFIG_PM - .suspend = wiphy_suspend, - .resume = wiphy_resume, -#endif + .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; -- cgit From 464daaf04cb633ae1530652fd23cdea0f0d21dd5 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Tue, 19 May 2015 14:13:36 +0200 Subject: mac80211: check fast-xmit on station change Drivers with fast-xmit (e.g. ath10k) running in AP_VLAN setups would fail to communicate with connected 4addr stations. The reason was when new station associates it first goes into master AP interface. It is not until later that a dedicated AP_VLAN is created for it and the station itself is moved there. After that Tx directed at the station should use 4addr header. However fast-xmit wasn't recalculated and 3addr header remained to be used. This in turn caused the connected 4addr stations to drop packets coming from the AP until some other event would cause fast-xmit to recalculate for that station (which could never come). Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 3469bbdc891c..bb9f83640b46 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1411,6 +1411,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, } sta->sdata = vlansdata; + ieee80211_check_fast_xmit(sta); if (sta->sta_state == IEEE80211_STA_AUTHORIZED && prev_4addr != new_4addr) { -- cgit From c5a71688e1e56e155fb79b8d699322f4f0793cc8 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Tue, 19 May 2015 14:36:48 +0300 Subject: mac80211: disconnect TDLS stations on STA CSA When a station does a channel switch, it's not well defined what its TDLS peers would do. Avoid a situation when the local side marks a potentially disconnected peer as a TDLS peer. Keeping peers connected through CSA is doubly problematic with the upcoming TDLS WIDER-BW feature which allows peers to widen the BSS channel. The new channel transitioned-to might not be compatible and would require a re-negotiation anyway. Make sure to disallow new TDLS link during CSA. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 26 ++++++++++++++++++++++++++ net/mac80211/tdls.c | 6 ++++++ 2 files changed, 32 insertions(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3294666f599c..387fe70ab126 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1098,6 +1098,24 @@ static void ieee80211_chswitch_timer(unsigned long data) ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.chswitch_work); } +static void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + u16 reason = WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + continue; + + ieee80211_tdls_oper_request(&sdata->vif, sta->sta.addr, + NL80211_TDLS_TEARDOWN, reason, + GFP_ATOMIC); + } + rcu_read_unlock(); +} + static void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, u64 timestamp, u32 device_timestamp, @@ -1161,6 +1179,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } + /* + * Drop all TDLS peers - either we disconnect or move to a different + * channel from this point on. There's no telling what our peer will do. + * The TDLS WIDER_BW scenario is also problematic, as peers might now + * have an incompatible wider chandef. + */ + ieee80211_teardown_tdls_peers(sdata); + mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 8a92a920ff17..75e8e3bba538 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1183,6 +1183,12 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, switch (oper) { case NL80211_TDLS_ENABLE_LINK: + if (sdata->vif.csa_active) { + tdls_dbg(sdata, "TDLS: disallow link during CSA\n"); + ret = -EBUSY; + break; + } + rcu_read_lock(); sta = sta_info_get(sdata, peer); if (!sta) { -- cgit From 765c9c639fbb132af0cafc6e1da22fe6cea26bb8 Mon Sep 17 00:00:00 2001 From: Erik Kline Date: Mon, 18 May 2015 19:44:41 +0900 Subject: neigh: Better handling of transition to NUD_PROBE state [1] When entering NUD_PROBE state via neigh_update(), perhaps received from userspace, correctly (re)initialize the probes count to zero. This is useful for forcing revalidation of a neighbor (for example if the host is attempting to do DNA [IPv4 4436, IPv6 6059]). [2] Notify listeners when a neighbor goes into NUD_PROBE state. By sending notifications on entry to NUD_PROBE state listeners get more timely warnings of imminent connectivity issues. The current notifications on entry to NUD_STALE have somewhat limited usefulness: NUD_STALE is a perfectly normal state, as is NUD_DELAY, whereas notifications on entry to NUD_FAILURE come after a neighbor reachability problem has been confirmed (typically after three probes). Signed-off-by: Erik Kline Acked-By: Lorenzo Colitti Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/neighbour.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3de654256028..3a74df750af4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -913,6 +913,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); + notify = 1; next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } } else { @@ -1144,6 +1145,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (new != old) { neigh_del_timer(neigh); + if (new & NUD_PROBE) + atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? -- cgit From eb9344781a2f8381ed60cd9e662d9ced2d168ecb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2015 13:26:55 -0700 Subject: tcp: add a force_schedule argument to sk_stream_alloc_skb() In commit 8e4d980ac215 ("tcp: fix behavior for epoll edge trigger") we fixed a possible hang of TCP sockets under memory pressure, by allowing sk_stream_alloc_skb() to use sk_forced_mem_schedule() if no packet is in socket write queue. It turns out there are other cases where we want to force memory schedule : tcp_fragment() & tso_fragment() need to split a big TSO packet into two smaller ones. If we block here because of TCP memory pressure, we can effectively block TCP socket from sending new data. If no further ACK is coming, this hang would be definitive, and socket has no chance to effectively reduce its memory usage. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 19 +++++++++++-------- net/ipv4/tcp_output.c | 10 +++++----- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bb9bb844204f..ca1d476c80ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -808,7 +808,8 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; @@ -820,15 +821,15 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); if (likely(skb)) { - bool mem_schedule; + bool mem_scheduled; - if (skb_queue_len(&sk->sk_write_queue) == 0) { - mem_schedule = true; + if (force_schedule) { + mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); } else { - mem_schedule = sk_wmem_schedule(sk, skb->truesize); + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } - if (likely(mem_schedule)) { + if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes @@ -918,7 +919,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -1154,7 +1156,8 @@ new_segment: skb = sk_stream_alloc_skb(sk, select_size(sk, sg), - sk->sk_allocation); + sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e29d43b5a0bb..3aebe0157dfa 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1172,7 +1172,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, gfp); + buff = sk_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ @@ -1731,7 +1731,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now, gfp); - buff = sk_stream_alloc_skb(sk, 0, gfp); + buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; @@ -1950,7 +1950,7 @@ static int tcp_mtu_probe(struct sock *sk) } /* We're allowed to probe. Build it now. */ - nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC); + nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; sk->sk_wmem_queued += nskb->truesize; @@ -3190,7 +3190,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false); if (!syn_data) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; @@ -3256,7 +3256,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; -- cgit From e7582bab5d28ea72e07cf2c74632eaf46a6c1a50 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2015 22:33:25 +0200 Subject: net: dev: reduce both ingress hook ifdefs Reduce ifdef pollution slightly, no functional change. We can simply remove the extra alternative definition of handle_ing() and nf_ingress(). Signed-off-by: Daniel Borkmann Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0e7afefb5072..594163d0c6eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3627,11 +3627,11 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -#ifdef CONFIG_NET_CLS_ACT static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { +#ifdef CONFIG_NET_CLS_ACT struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); struct tcf_result cl_res; @@ -3665,17 +3665,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, default: break; } - - return skb; -} -#else -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) -{ +#endif /* CONFIG_NET_CLS_ACT */ return skb; } -#endif /** * netdev_rx_handler_register - register receive handler @@ -3748,10 +3740,10 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) } } -#ifdef CONFIG_NETFILTER_INGRESS static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { +#ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); @@ -3760,15 +3752,9 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, return nf_hook_ingress(skb); } +#endif /* CONFIG_NETFILTER_INGRESS */ return 0; } -#else -static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) -{ - return 0; -} -#endif static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { -- cgit From 04fd61ab36ec065e194ab5e74ae34a5240d992bb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2015 16:59:03 -0700 Subject: bpf: allow bpf programs to tail-call other bpf programs introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding extra call frame. It's trivially done in interpreter and a bit trickier in JITs. In case of x64 JIT the bigger part of generated assembler prologue is common for all programs, so it is simply skipped while jumping. Other JITs can do similar prologue-skipping optimization or do stack unwind before jumping into the next program. bpf_tail_call() arguments: ctx - context pointer jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table index - index in the jump table Since all BPF programs are idenitified by file descriptor, user space need to populate the jmp_table with FDs of other BPF programs. If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere and program execution continues as normal. New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can populate this jmp_table array with FDs of other bpf programs. Programs can share the same jmp_table array or use multiple jmp_tables. The chain of tail calls can form unpredictable dynamic loops therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. Use cases: Acked-by: Daniel Borkmann ========== - simplify complex programs by splitting them into a sequence of small programs - dispatch routine For tracing and future seccomp the program may be triggered on all system calls, but processing of syscall arguments will be different. It's more efficient to implement them as: int syscall_entry(struct seccomp_data *ctx) { bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */); ... default: process unknown syscall ... } int sys_write_event(struct seccomp_data *ctx) {...} int sys_read_event(struct seccomp_data *ctx) {...} syscall_jmp_table[__NR_write] = sys_write_event; syscall_jmp_table[__NR_read] = sys_read_event; For networking the program may call into different parsers depending on packet format, like: int packet_parser(struct __sk_buff *skb) { ... parse L2, L3 here ... __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol)); bpf_tail_call(skb, &ipproto_jmp_table, ipproto); ... default: process unknown protocol ... } int parse_tcp(struct __sk_buff *skb) {...} int parse_udp(struct __sk_buff *skb) {...} ipproto_jmp_table[IPPROTO_TCP] = parse_tcp; ipproto_jmp_table[IPPROTO_UDP] = parse_udp; - for TC use case, bpf_tail_call() allows to implement reclassify-like logic - bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table are atomic, so user space can build chains of BPF programs on the fly Implementation details: ======================= - high performance of bpf_tail_call() is the goal. It could have been implemented without JIT changes as a wrapper on top of BPF_PROG_RUN() macro, but with two downsides: . all programs would have to pay performance penalty for this feature and tail call itself would be slower, since mandatory stack unwind, return, stack allocate would be done for every tailcall. . tailcall would be limited to programs running preempt_disabled, since generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would need to be either global per_cpu variable accessed by helper and by wrapper or global variable protected by locks. In this implementation x64 JIT bypasses stack unwind and jumps into the callee program after prologue. - bpf_prog_array_compatible() ensures that prog_type of callee and caller are the same and JITed/non-JITed flag is the same, since calling JITed program from non-JITed is invalid, since stack frames are different. Similarly calling kprobe type program from socket type program is invalid. - jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map' abstraction, its user space API and all of verifier logic. It's in the existing arraymap.c file, since several functions are shared with regular array map. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 6805717be614..3adcca6f17a4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1421,6 +1421,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; default: return NULL; } -- cgit From ce5ec440994b7b2b714633b516b9e0a4f6a98dfe Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 20 May 2015 15:52:53 +0000 Subject: tcp: ensure epoll edge trigger wakeup when write queue is empty We currently rely on the setting of SOCK_NOSPACE in the write() path to ensure that we wake up any epoll edge trigger waiters when acks return to free space in the write queue. However, if we fail to allocate even a single skb in the write queue, we could end up waiting indefinitely. Fix this by explicitly issuing a wakeup when we detect the condition of an empty write queue and a return value of -EAGAIN. This allows userspace to re-try as we expect this to be a temporary failure. I've tested this approach by artificially making sk_stream_alloc_skb() return NULL periodically. In that case, epoll edge trigger waiters will hang indefinitely in epoll_wait() without this patch. Signed-off-by: Jason Baron Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ca1d476c80ef..0a3f9a00565b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -999,6 +999,9 @@ do_error: if (copied) goto out; out_err: + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); return sk_stream_error(sk, flags, err); } @@ -1288,6 +1291,9 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); release_sock(sk); return err; } -- cgit From f5af1f57a2914e290de40e2c93716da8885c4965 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 May 2015 10:59:01 -0700 Subject: inet_hashinfo: remove bsocket counter We no longer need bsocket atomic counter, as inet_csk_get_port() calls bind_conflict() regardless of its value, after commit 2b05ad33e1e624e ("tcp: bind() fix autoselection to share ports") This patch removes overhead of maintaining this counter and double inet_csk_get_port() calls under pressure. Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Flavio Leitner Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 5 ----- net/ipv4/inet_hashtables.c | 7 ------- 2 files changed, 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8976ca423a07..b95fb263a13f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -127,11 +127,6 @@ again: (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; - if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && - !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = smallest_rover; - goto tb_found; - } } if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { snum = rover; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c6fb80bd5826..3766bddb3e8a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -90,10 +90,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - - atomic_inc(&hashinfo->bsockets); - inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); tb->num_owners++; @@ -111,8 +107,6 @@ static void __inet_put_port(struct sock *sk) struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; - atomic_dec(&hashinfo->bsockets); - spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); @@ -608,7 +602,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h) { int i; - atomic_set(&h->bsockets, 0); for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, -- cgit From 946f9eb226c296da0d6c630bdad282ca11d77f60 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 May 2015 10:59:02 -0700 Subject: tcp: improve REUSEADDR/NOREUSEADDR cohabitation inet_csk_get_port() randomization effort tends to spread sockets on all the available range (ip_local_port_range) This is unfortunate because SO_REUSEADDR sockets have less requirements than non SO_REUSEADDR ones. If an application uses SO_REUSEADDR hint, it is to try to allow source ports being shared. So instead of picking a random port number in ip_local_port_range, lets try first in first half of the range. This gives more chances to use upper half of the range for the sockets with strong requirements (not using SO_REUSEADDR) Note this patch does not add a new sysctl, and only changes the way we try to pick port number. Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Flavio Leitner Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b95fb263a13f..60021d0d9326 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -99,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); int smallest_size = -1, smallest_rover; kuid_t uid = sock_i_uid(sk); + int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; local_bh_disable(); if (!snum) { @@ -106,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); + if (attempt_half) { + int half = low + ((high - low) >> 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } remaining = (high - low) + 1; smallest_rover = rover = prandom_u32() % remaining + low; @@ -154,6 +163,11 @@ again: snum = smallest_rover; goto have_snum; } + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto again; + } goto fail; } /* OK, here is the one we will use. HEAD is -- cgit From 48ed7b26faa758e6612cd1fb11c07f25cd54f771 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 May 2015 00:25:41 +0200 Subject: ipv6: reject locally assigned nexthop addresses ip -6 addr add dead::1/128 dev eth0 sleep 5 ip -6 route add default via dead::1/128 -> fails ip -6 addr add dead::1/128 dev eth0 ip -6 route add default via dead::1/128 -> succeeds reason is that if (nonsensensical) route above is added, dead::1 is still subject to DAD, so the route lookup will pick eth0 as outdev due to the prefix route that is added before DAD work is started. Add explicit test that checks if nexthop gateway is a local address. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1167969 Signed-off-by: Florian Westphal Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6f4a35096bde..98fce6f4a580 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1624,6 +1624,16 @@ int ip6_route_add(struct fib6_config *cfg) int gwa_type; gw_addr = &cfg->fc_gateway; + + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + err = -EINVAL; + if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0)) + goto out; + rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); @@ -1637,7 +1647,6 @@ int ip6_route_add(struct fib6_config *cfg) (SIT, PtP, NBMA NOARP links) it is handy to allow some exceptions. --ANK */ - err = -EINVAL; if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; -- cgit From 2efd055c53c06b7e89c167c98069bab9afce7e59 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 20 May 2015 16:35:41 -0700 Subject: tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info This patch tracks the total number of inbound and outbound segments on a TCP socket. One may use this number to have an idea on connection quality when compared against the retransmissions. RFC4898 named these : tcpEStatsPerfSegsIn and tcpEStatsPerfSegsOut These are a 32bit field each and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->segs_out was placed near tp->snd_nxt for good data locality and minimal performance impact, while tp->segs_in was placed near tp->bytes_received for the same reason. Join work with Eric Dumazet. Note that received SYN are accounted on the listener, but sent SYNACK are not accounted. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 5 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0a3f9a00565b..7f3e721b9e69 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2695,6 +2695,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) spin_lock_bh(&sk->sk_lock.slock); info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; + info->tcpi_segs_out = tp->segs_out; + info->tcpi_segs_in = tp->segs_in; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0cc4b5a630cd..feb875769b8d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1626,6 +1626,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ebe2ab2596ed..b62d15c86946 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -448,6 +448,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 0; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3aebe0157dfa..534e5fdb04c1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1027,6 +1027,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + tp->segs_out += tcp_skb_pcount(skb); /* OK, its time to fill skb_shinfo(skb)->gso_segs */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b6575d665568..beac6bf840b9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1421,6 +1421,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) -- cgit From 12c227ec89a70c14e3efcf102c5babece381e172 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 22 May 2015 11:05:58 +0200 Subject: flow_dissector: do not break if ports are not needed in flowlabel This restored previous behaviour. If caller does not want ports to be filled, we should not break. Fixes: 06635a35d13d ("flow_dissect: use programable dissector in skb_flow_dissect and friends") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 703d05916a80..1f2d89300b1a 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -221,13 +221,13 @@ flow_label: key_basic->ip_proto = ip_proto; key_basic->thoff = (u16)nhoff; - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) - break; - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS, - target_container); - key_ports->ports = flow_label; + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = flow_label; + } return true; } -- cgit From d079abd181950a44cdf31daafd1662388a6c4d2e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 May 2015 12:16:11 +0200 Subject: pktgen: adjust spacing in proc file interface output Too many spaces were introduced in commit 63adc6fb8ac0 ("pktgen: cleanup checkpatch warnings"), thus misaligning "src_min:" to other columns. Fixes: 63adc6fb8ac0 ("pktgen: cleanup checkpatch warnings") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 62f979984a23..167e59243fea 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -572,7 +572,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) " dst_min: %s dst_max: %s\n", pkt_dev->dst_min, pkt_dev->dst_max); seq_printf(seq, - " src_min: %s src_max: %s\n", + " src_min: %s src_max: %s\n", pkt_dev->src_min, pkt_dev->src_max); } -- cgit From 4020726479ff799318a5aa188b81d79df86a0ea3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 May 2015 12:16:56 +0200 Subject: pktgen: make /proc/net/pktgen/pgctrl report fail on invalid input Giving /proc/net/pktgen/pgctrl an invalid command just returns shell success and prints a warning in dmesg. This is not very useful for shell scripting, as it can only detect the error by parsing dmesg. Instead return -EINVAL when the command is unknown, as this provides userspace shell scripting a way of detecting this. Also bump version tag to 2.75, because (1) reading /proc/net/pktgen/pgctrl output this version number which would allow to detect this small semantic change, and (2) because the pktgen version tag have not been updated since 2010. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 167e59243fea..676550f34560 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -177,7 +177,7 @@ #include #include /* do_div */ -#define VERSION "2.74" +#define VERSION "2.75" #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) @@ -512,7 +512,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, pktgen_reset_all_threads(pn); else - pr_warn("Unknown command: %s\n", data); + return -EINVAL; return count; } -- cgit From 4a669f7d72535017dd03cbcdf5af6e85edfdf90c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 22 May 2015 17:43:51 +0200 Subject: mac802154: fix hold rtnl while ioctl This patch fixes an issue to set address configuration with ioctl. Accessing the mib requires rtnl lock and the ndo_do_ioctl doesn't hold the rtnl lock while this callback is called. This patch do that manually. Signed-off-by: Alexander Aring Reported-by: Matteo Petracca Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 91b75abbd1a1..2a5878889289 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -62,8 +62,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; - ASSERT_RTNL(); - + rtnl_lock(); spin_lock_bh(&sdata->mib_lock); switch (cmd) { @@ -90,6 +89,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCSIFADDR: if (netif_running(dev)) { spin_unlock_bh(&sdata->mib_lock); + rtnl_unlock(); return -EBUSY; } @@ -112,6 +112,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } spin_unlock_bh(&sdata->mib_lock); + rtnl_unlock(); return err; } -- cgit From 4a3a8c0c3a613e481bea931f0d65dc4a7efaa9b9 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 22 May 2015 17:43:52 +0200 Subject: mac802154: remove pib lock This patch removes the pib lock which is now replaced by rtnl lock. The new interface already use the rtnl lock only. Nevertheless this patch will fix issues while using new and old interface at the same time. Signed-off-by: Alexander Aring Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/ieee802154/core.c | 2 -- net/ieee802154/nl-phy.c | 6 +++--- net/mac802154/iface.c | 7 ------- net/mac802154/mib.c | 4 ++-- 4 files changed, 5 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 2ee00e8a0308..b0248e934230 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -121,8 +121,6 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wpan_phy_idx--; - mutex_init(&rdev->wpan_phy.pib_lock); - INIT_LIST_HEAD(&rdev->wpan_dev_list); device_initialize(&rdev->wpan_phy.dev); dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx); diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index cbc0d09351e0..77d73014bde3 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -50,7 +50,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, if (!hdr) goto out; - mutex_lock(&phy->pib_lock); + rtnl_lock(); if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) || nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel)) @@ -63,13 +63,13 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST, pages * sizeof(uint32_t), buf)) goto nla_put_failure; - mutex_unlock(&phy->pib_lock); + rtnl_unlock(); kfree(buf); genlmsg_end(msg, hdr); return 0; nla_put_failure: - mutex_unlock(&phy->pib_lock); + rtnl_unlock(); genlmsg_cancel(msg, hdr); out: kfree(buf); diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 2a5878889289..22f478be7489 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -242,7 +242,6 @@ static int mac802154_wpan_open(struct net_device *dev) struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; - struct wpan_phy *phy = sdata->local->phy; rc = ieee802154_check_concurrent_iface(sdata, sdata->vif.type); if (rc < 0) @@ -252,8 +251,6 @@ static int mac802154_wpan_open(struct net_device *dev) if (rc < 0) return rc; - mutex_lock(&phy->pib_lock); - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { rc = drv_set_promiscuous_mode(local, wpan_dev->promiscuous_mode); @@ -295,11 +292,7 @@ static int mac802154_wpan_open(struct net_device *dev) goto out; } - mutex_unlock(&phy->pib_lock); - return 0; - out: - mutex_unlock(&phy->pib_lock); return rc; } diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 5cf019a57fd7..033dfc7755c6 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -91,16 +91,16 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) struct ieee802154_local *local = sdata->local; int res; + ASSERT_RTNL(); + BUG_ON(dev->type != ARPHRD_IEEE802154); res = drv_set_channel(local, page, chan); if (res) { pr_debug("set_channel failed\n"); } else { - mutex_lock(&local->phy->pib_lock); local->phy->current_channel = chan; local->phy->current_page = page; - mutex_unlock(&local->phy->pib_lock); } } -- cgit From 344f8c119df742f2bf7098cf8fc326351f583249 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 22 May 2015 17:43:53 +0200 Subject: mac802154: use atomic ops for sequence incrementation This patch will use atomic operations for sequence number incrementation while MAC header generation. Upper layers like af_802154 or 6LoWPAN could call this function in a parallel context while generating 802.15.4 MAC header before queuing into wpan interfaces transmit queue. Signed-off-by: Alexander Aring Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/core.c | 8 -------- net/mac802154/ieee802154_i.h | 1 - net/mac802154/iface.c | 9 ++++++--- net/mac802154/mac_cmd.c | 1 - net/mac802154/mib.c | 9 --------- 5 files changed, 6 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 0ae5822ef944..2e77fada7e54 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -69,13 +69,6 @@ static __le16 lowpan_get_short_addr(const struct net_device *dev) return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); } -static u8 lowpan_get_dsn(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); -} - static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; @@ -106,7 +99,6 @@ static const struct net_device_ops lowpan_netdev_ops = { static struct ieee802154_mlme_ops lowpan_mlme = { .get_pan_id = lowpan_get_pan_id, .get_short_addr = lowpan_get_short_addr, - .get_dsn = lowpan_get_dsn, }; static void lowpan_setup(struct net_device *dev) diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 127ba18386fc..56b3847466e9 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -141,7 +141,6 @@ __le16 mac802154_dev_get_short_addr(const struct net_device *dev); __le16 mac802154_dev_get_pan_id(const struct net_device *dev); void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val); void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); -u8 mac802154_dev_get_dsn(const struct net_device *dev); int mac802154_get_params(struct net_device *dev, struct ieee802154_llsec_params *params); diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 22f478be7489..0ec7bc402e29 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -368,7 +368,7 @@ static int mac802154_header_create(struct sk_buff *skb, hdr.fc.type = cb->type; hdr.fc.security_enabled = cb->secen; hdr.fc.ack_request = cb->ackreq; - hdr.seq = ieee802154_mlme_ops(dev)->get_dsn(dev); + hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; if (mac802154_set_header_security(sdata, &hdr, cb) < 0) return -EINVAL; @@ -468,13 +468,16 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; + u8 tmp; /* set some type-dependent values */ sdata->vif.type = type; sdata->wpan_dev.iftype = type; - get_random_bytes(&wpan_dev->bsn, 1); - get_random_bytes(&wpan_dev->dsn, 1); + get_random_bytes(&tmp, sizeof(tmp)); + atomic_set(&wpan_dev->bsn, tmp); + get_random_bytes(&tmp, sizeof(tmp)); + atomic_set(&wpan_dev->dsn, tmp); /* defaults per 802.15.4-2011 */ wpan_dev->min_be = 3; diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index 6dcbb3b5994c..3b347d4d3d4c 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -153,7 +153,6 @@ struct ieee802154_mlme_ops mac802154_mlme_wpan = { .start_req = mac802154_mlme_start_req, .get_pan_id = mac802154_dev_get_pan_id, .get_short_addr = mac802154_dev_get_short_addr, - .get_dsn = mac802154_dev_get_dsn, .llsec = &mac802154_llsec_ops, diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 033dfc7755c6..00a62de9cc23 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -76,15 +76,6 @@ void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val) spin_unlock_bh(&sdata->mib_lock); } -u8 mac802154_dev_get_dsn(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - return sdata->wpan_dev.dsn++; -} - void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); -- cgit From c947f7e1e31a708f5a4ea8c1a627bec578cd9223 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 22 May 2015 17:43:54 +0200 Subject: mac802154: remove mib lock This patch removes the mib lock. The new locking mechanism is to protect the mib values with the rtnl lock. Note that this isn't always necessary if we have an interface up the most mib values are readonly (e.g. address settings). With this behaviour we can remove locking in hotpath like frame parsing completely. It depends on context if we need to hold the rtnl lock or not, this makes the callbacks of ieee802154_mlme_ops unnecessary because these callbacks hols always the locks. Signed-off-by: Alexander Aring Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/core.c | 20 ----------------- net/ieee802154/6lowpan/tx.c | 2 +- net/ieee802154/nl-mac.c | 14 ++++++++---- net/ieee802154/socket.c | 12 +++++------ net/mac802154/ieee802154_i.h | 6 ------ net/mac802154/iface.c | 8 ------- net/mac802154/mac_cmd.c | 6 ++---- net/mac802154/mib.c | 50 ------------------------------------------- net/mac802154/rx.c | 5 ----- 9 files changed, 18 insertions(+), 105 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 2e77fada7e54..f20a387a1011 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -55,20 +55,6 @@ LIST_HEAD(lowpan_devices); static int lowpan_open_count; -static __le16 lowpan_get_pan_id(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); -} - -static __le16 lowpan_get_short_addr(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); -} - static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; @@ -96,11 +82,6 @@ static const struct net_device_ops lowpan_netdev_ops = { .ndo_start_xmit = lowpan_xmit, }; -static struct ieee802154_mlme_ops lowpan_mlme = { - .get_pan_id = lowpan_get_pan_id, - .get_short_addr = lowpan_get_short_addr, -}; - static void lowpan_setup(struct net_device *dev) { dev->addr_len = IEEE802154_ADDR_LEN; @@ -116,7 +97,6 @@ static void lowpan_setup(struct net_device *dev) dev->netdev_ops = &lowpan_netdev_ops; dev->header_ops = &lowpan_header_ops; - dev->ml_priv = &lowpan_mlme; dev->destructor = free_netdev; dev->features |= NETIF_F_NETNS_LOCAL; } diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 2349070bd534..98acf7319754 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -207,7 +207,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) /* prepare wpan address data */ sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + sa.pan_id = lowpan_dev_info(dev)->real_dev->ieee802154_ptr->pan_id; sa.extended_addr = ieee802154_devaddr_from_raw(saddr); /* intra-PAN communications */ diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index cdc1cc3543f6..ada58a81b753 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -97,8 +97,10 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, BUG_ON(!phy); get_device(&phy->dev); - short_addr = ops->get_short_addr(dev); - pan_id = ops->get_pan_id(dev); + rtnl_lock(); + short_addr = dev->ieee802154_ptr->short_addr; + pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || @@ -244,7 +246,9 @@ int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info) addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); - addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + rtnl_lock(); + addr.pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), @@ -281,7 +285,9 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); } - addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + rtnl_lock(); + addr.pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 7aaaf967df58..e5cc2537c2a3 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -64,10 +64,8 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) if (tmp->type != ARPHRD_IEEE802154) continue; - pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp); - short_addr = - ieee802154_mlme_ops(tmp)->get_short_addr(tmp); - + pan_id = tmp->ieee802154_ptr->pan_id; + short_addr = tmp->ieee802154_ptr->short_addr; if (pan_id == addr->pan_id && short_addr == addr->short_addr) { dev = tmp; @@ -797,9 +795,9 @@ static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); - pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev); - hw_addr = ieee802154_devaddr_from_raw(dev->dev_addr); + pan_id = dev->ieee802154_ptr->pan_id; + short_addr = dev->ieee802154_ptr->short_addr; + hw_addr = dev->ieee802154_ptr->extended_addr; read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 56b3847466e9..eec668f3637f 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -86,8 +86,6 @@ struct ieee802154_sub_if_data { unsigned long state; char name[IFNAMSIZ]; - spinlock_t mib_lock; - /* protects sec from concurrent access by netlink. access by * encrypt/decrypt/header_create safe without additional protection. */ @@ -136,10 +134,6 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer); /* MIB callbacks */ -void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val); -__le16 mac802154_dev_get_short_addr(const struct net_device *dev); -__le16 mac802154_dev_get_pan_id(const struct net_device *dev); -void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val); void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); int mac802154_get_params(struct net_device *dev, diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 0ec7bc402e29..f30788d2702f 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -63,7 +63,6 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) int err = -ENOIOCTLCMD; rtnl_lock(); - spin_lock_bh(&sdata->mib_lock); switch (cmd) { case SIOCGIFADDR: @@ -88,7 +87,6 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } case SIOCSIFADDR: if (netif_running(dev)) { - spin_unlock_bh(&sdata->mib_lock); rtnl_unlock(); return -EBUSY; } @@ -111,7 +109,6 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - spin_unlock_bh(&sdata->mib_lock); rtnl_unlock(); return err; } @@ -374,8 +371,6 @@ static int mac802154_header_create(struct sk_buff *skb, return -EINVAL; if (!saddr) { - spin_lock_bh(&sdata->mib_lock); - if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { @@ -387,8 +382,6 @@ static int mac802154_header_create(struct sk_buff *skb, } hdr.source.pan_id = wpan_dev->pan_id; - - spin_unlock_bh(&sdata->mib_lock); } else { hdr.source = *(const struct ieee802154_addr *)saddr; } @@ -500,7 +493,6 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->ml_priv = &mac802154_mlme_wpan; wpan_dev->promiscuous_mode = false; - spin_lock_init(&sdata->mib_lock); mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index 3b347d4d3d4c..5220c2b2735b 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -43,8 +43,8 @@ static int mac802154_mlme_start_req(struct net_device *dev, BUG_ON(addr->mode != IEEE802154_ADDR_SHORT); - mac802154_dev_set_pan_id(dev, addr->pan_id); - mac802154_dev_set_short_addr(dev, addr->short_addr); + dev->ieee802154_ptr->pan_id = addr->pan_id; + dev->ieee802154_ptr->short_addr = addr->short_addr; mac802154_dev_set_page_channel(dev, page, channel); if (ops->llsec) { @@ -151,8 +151,6 @@ static struct ieee802154_llsec_ops mac802154_llsec_ops = { struct ieee802154_mlme_ops mac802154_mlme_wpan = { .start_req = mac802154_mlme_start_req, - .get_pan_id = mac802154_dev_get_pan_id, - .get_short_addr = mac802154_dev_get_short_addr, .llsec = &mac802154_llsec_ops, diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 00a62de9cc23..73f94fbf8785 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -26,56 +26,6 @@ #include "ieee802154_i.h" #include "driver-ops.h" -void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - sdata->wpan_dev.short_addr = val; - spin_unlock_bh(&sdata->mib_lock); -} - -__le16 mac802154_dev_get_short_addr(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - __le16 ret; - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - ret = sdata->wpan_dev.short_addr; - spin_unlock_bh(&sdata->mib_lock); - - return ret; -} - -__le16 mac802154_dev_get_pan_id(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - __le16 ret; - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - ret = sdata->wpan_dev.pan_id; - spin_unlock_bh(&sdata->mib_lock); - - return ret; -} - -void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - sdata->wpan_dev.pan_id = val; - spin_unlock_bh(&sdata->mib_lock); -} - void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index c0d67b2b4132..e0f10063cac3 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -47,8 +47,6 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, pr_debug("getting packet via slave interface %s\n", sdata->dev->name); - spin_lock_bh(&sdata->mib_lock); - span = wpan_dev->pan_id; sshort = wpan_dev->short_addr; @@ -83,13 +81,10 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, skb->pkt_type = PACKET_OTHERHOST; break; default: - spin_unlock_bh(&sdata->mib_lock); pr_debug("invalid dest mode\n"); goto fail; } - spin_unlock_bh(&sdata->mib_lock); - skb->dev = sdata->dev; rc = mac802154_llsec_decrypt(&sdata->sec, skb); -- cgit From be12a1fe298e8be04d5215364f94654dff81b0bc Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 16:59:58 +0200 Subject: net: skbuff: add skb_append_pagefrags and use it Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 18 ++++++++++++++++++ net/ipv4/ip_output.c | 15 ++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f3fe9bd9e672..4f2babeaf18d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2915,6 +2915,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL(skb_append_datato_frags); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size) +{ + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(skb_append_pagefrags); + /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d91b922fcfe..451b009dae75 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1233,11 +1233,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, } while (size > 0) { - int i; - - if (skb_is_gso(skb)) + if (skb_is_gso(skb)) { len = size; - else { + } else { /* Check if the remaining data fits into current packet. */ len = mtu - skb->len; @@ -1289,15 +1287,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, continue; } - i = skb_shinfo(skb)->nr_frags; if (len > size) len = size; - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); - } else if (i < MAX_SKB_FRAGS) { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, len); - } else { + + if (skb_append_pagefrags(skb, page, offset, len)) { err = -EMSGSIZE; goto error; } -- cgit From 869e7c62486ec0e170a9771acaa251d1a33b5871 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 16:59:59 +0200 Subject: net: af_unix: implement stream sendpage support This patch implements sendpage support for AF_UNIX SOCK_STREAM sockets. This is also required for a complete splice implementation. The implementation is a bit tricky because we append to already existing skbs and so have to hold unix_sk->readlock to protect the reading side from either advancing UNIXCB.consumed or freeing the skb at the socket receive tail. Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 941b3d26e3bf..7762c0b46721 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -518,6 +518,8 @@ static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); +static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, + size_t size, int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_dgram_connect(struct socket *, struct sockaddr *, @@ -558,7 +560,7 @@ static const struct proto_ops unix_stream_ops = { .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, + .sendpage = unix_stream_sendpage, .set_peek_off = unix_set_peek_off, }; @@ -1720,6 +1722,101 @@ out_err: return sent ? : err; } +static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, + int offset, size_t size, int flags) +{ + int err = 0; + bool send_sigpipe = true; + struct sock *other, *sk = socket->sk; + struct sk_buff *skb, *newskb = NULL, *tail = NULL; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + other = unix_peer(sk); + if (!other || sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + if (false) { +alloc_skb: + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->readlock); + newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, + &err, 0); + if (!newskb) + return err; + } + + /* we must acquire readlock as we modify already present + * skbs in the sk_receive_queue and mess with skb->len + */ + err = mutex_lock_interruptible(&unix_sk(other)->readlock); + if (err) { + err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; + send_sigpipe = false; + goto err; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + err = -EPIPE; + goto err_unlock; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + other->sk_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; + goto err_state_unlock; + } + + skb = skb_peek_tail(&other->sk_receive_queue); + if (tail && tail == skb) { + skb = newskb; + } else if (!skb) { + if (newskb) + skb = newskb; + else + goto alloc_skb; + } else if (newskb) { + /* this is fast path, we don't necessarily need to + * call to kfree_skb even though with newskb == NULL + * this - does no harm + */ + consume_skb(newskb); + } + + if (skb_append_pagefrags(skb, page, offset, size)) { + tail = skb; + goto alloc_skb; + } + + skb->len += size; + skb->data_len += size; + skb->truesize += size; + atomic_add(size, &sk->sk_wmem_alloc); + + if (newskb) + __skb_queue_tail(&other->sk_receive_queue, newskb); + + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->readlock); + + other->sk_data_ready(other); + + return size; + +err_state_unlock: + unix_state_unlock(other); +err_unlock: + mutex_unlock(&unix_sk(other)->readlock); +err: + kfree_skb(newskb); + if (send_sigpipe && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} + static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { -- cgit From a60e3cc7c92973a31fad0fd04dc5cf4355d3d1ef Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 17:00:00 +0200 Subject: net: make skb_splice_bits more configureable Prepare skb_splice_bits to be able to deal with AF_UNIX sockets. AF_UNIX sockets don't use lock_sock/release_sock and thus we have to use a callback to make the locking and unlocking configureable. Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 45 ++++++++++++++++++++++++++++----------------- net/ipv4/tcp.c | 5 +++-- 2 files changed, 31 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4f2babeaf18d..02769fa4f5c8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1870,15 +1870,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + /* Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. It does NOT handle frag lists within * the frag list, if such a thing exists. We'd probably need to recurse to * handle that cleanly. */ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags) + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -1891,7 +1915,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; - struct sock *sk = skb->sk; int ret = 0; /* @@ -1914,20 +1937,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, } done: - if (spd.nr_pages) { - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); - } + if (spd.nr_pages) + ret = splice_cb(sk, pipe, &spd); return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90afcec3f427..65f791f74845 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -695,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; -- cgit From 2b514574f7e88c8498027ee366fd6e7aae5aa4b5 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 17:00:01 +0200 Subject: net: af_unix: implement splice for stream af_unix sockets unix_stream_recvmsg is refactored to unix_stream_read_generic in this patch and enhanced to deal with pipe splicing. The refactoring is inneglible, we mostly have to deal with a non-existing struct msghdr argument. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + net/unix/af_unix.c | 140 +++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 120 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 02769fa4f5c8..9bac0e6f8dfa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1942,6 +1942,7 @@ done: return ret; } +EXPORT_SYMBOL_GPL(skb_splice_bits); /** * skb_store_bits - store bits from kernel buffer to skb diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7762c0b46721..b8c44076c776 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -520,6 +520,9 @@ static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, size_t size, int flags); +static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, + struct pipe_inode_info *, size_t size, + unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_dgram_connect(struct socket *, struct sockaddr *, @@ -561,6 +564,7 @@ static const struct proto_ops unix_stream_ops = { .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, + .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, }; @@ -1957,8 +1961,9 @@ out: * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last) + struct sk_buff *last, unsigned int last_len) { + struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); @@ -1966,7 +1971,9 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (skb_peek_tail(&sk->sk_receive_queue) != last || + tail = skb_peek_tail(&sk->sk_receive_queue); + if (tail != last || + (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || @@ -1990,38 +1997,50 @@ static unsigned int unix_skb_len(const struct sk_buff *skb) return skb->len - UNIXCB(skb).consumed; } -static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, - size_t size, int flags) +struct unix_stream_read_state { + int (*recv_actor)(struct sk_buff *, int, int, + struct unix_stream_read_state *); + struct socket *socket; + struct msghdr *msg; + struct pipe_inode_info *pipe; + size_t size; + int flags; + unsigned int splice_flags; +}; + +static int unix_stream_read_generic(struct unix_stream_read_state *state) { struct scm_cookie scm; + struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); int copied = 0; + int flags = state->flags; int noblock = flags & MSG_DONTWAIT; - int check_creds = 0; + bool check_creds = false; int target; int err = 0; long timeo; int skip; + size_t size = state->size; + unsigned int last_len; err = -EINVAL; if (sk->sk_state != TCP_ESTABLISHED) goto out; err = -EOPNOTSUPP; - if (flags&MSG_OOB) + if (flags & MSG_OOB) goto out; - target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); + memset(&scm, 0, sizeof(scm)); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - - memset(&scm, 0, sizeof(scm)); - err = mutex_lock_interruptible(&u->readlock); if (unlikely(err)) { /* recvmsg() in non blocking mode is supposed to return -EAGAIN @@ -2037,6 +2056,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, unix_state_lock(sk); last = skb = skb_peek(&sk->sk_receive_queue); + last_len = last ? last->len : 0; again: if (skb == NULL) { unix_sk(sk)->recursion_level = 0; @@ -2059,16 +2079,17 @@ again: break; mutex_unlock(&u->readlock); - timeo = unix_stream_data_wait(sk, timeo, last); + timeo = unix_stream_data_wait(sk, timeo, last, + last_len); - if (signal_pending(current) - || mutex_lock_interruptible(&u->readlock)) { + if (signal_pending(current) || + mutex_lock_interruptible(&u->readlock)) { err = sock_intr_errno(timeo); goto out; } continue; - unlock: +unlock: unix_state_unlock(sk); break; } @@ -2077,6 +2098,7 @@ again: while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; + last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -2093,18 +2115,20 @@ again: } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - check_creds = 1; + check_creds = true; } /* Copy address just once */ - if (sunaddr) { - unix_copy_addr(msg, skb->sk); + if (state->msg && state->msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, + state->msg->msg_name); + unix_copy_addr(state->msg, skb->sk); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, - msg, chunk)) { + chunk = state->recv_actor(skb, skip, chunk, state); + if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; @@ -2142,11 +2166,85 @@ again: } while (size); mutex_unlock(&u->readlock); - scm_recv(sock, msg, &scm, flags); + if (state->msg) + scm_recv(sock, state->msg, &scm, flags); + else + scm_destroy(&scm); out: return copied ? : err; } +static int unix_stream_read_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + int ret; + + ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + state->msg, chunk); + return ret ?: chunk; +} + +static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sock, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state); +} + +static ssize_t skb_unix_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + struct unix_sock *u = unix_sk(sk); + + mutex_unlock(&u->readlock); + ret = splice_to_pipe(pipe, spd); + mutex_lock(&u->readlock); + + return ret; +} + +static int unix_stream_splice_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + return skb_splice_bits(skb, state->socket->sk, + UNIXCB(skb).consumed + skip, + state->pipe, chunk, state->splice_flags, + skb_unix_socket_splice); +} + +static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t size, unsigned int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_splice_actor, + .socket = sock, + .pipe = pipe, + .size = size, + .splice_flags = flags, + }; + + if (unlikely(*ppos)) + return -ESPIPE; + + if (sock->file->f_flags & O_NONBLOCK || + flags & SPLICE_F_NONBLOCK) + state.flags = MSG_DONTWAIT; + + return unix_stream_read_generic(&state); +} + static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; -- cgit From cf826244322443215c156c0751fa2c982969a9c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 May 2015 00:06:40 +0200 Subject: ip: reject too-big defragmented DF-skb when forwarding Send icmp pmtu error if we find that the largest fragment of df-skb exceeded the output path mtu. The ip output path will still catch this later on but we can avoid the forward/postrouting hook traversal by rejecting right away. This is what ipv6 already does. Acked-by: Hannes Frederic Sowa Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3674484946a5..2d3aa408fbdc 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,17 +39,21 @@ #include #include -static bool ip_may_fragment(const struct sk_buff *skb) -{ - return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - skb->ignore_df; -} - static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; + if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)) + return false; + + /* original fragment exceeds mtu and DF is set */ + if (unlikely(IPCB(skb)->frag_max_size > mtu)) + return true; + + if (skb->ignore_df) + return false; + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) return false; @@ -114,7 +118,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { + if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -- cgit From 286c2349f6665c3e67f464a5faa14a0e28be4842 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:55:56 -0700 Subject: ipv6: Clean up ipv6_select_ident() and ip6_fragment() This patch changes the ipv6_select_ident() signature to return a fragment id instead of taking a whole frag_hdr as a param to only set the frag_hdr->identification. It also cleans up ip6_fragment() to obtain the fragment id at the beginning instead of using multiple "if" later to check fragment id has been generated or not. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 17 ++++++----------- net/ipv6/output_core.c | 5 ++--- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index bc09cb97b840..05e2cdf938bd 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -551,7 +551,7 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, struct frag_hdr *fh; unsigned int mtu, hlen, left, len; int hroom, troom; - __be32 frag_id = 0; + __be32 frag_id; int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; struct net *net = dev_net(skb_dst(skb)->dev); @@ -584,6 +584,8 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, } mtu -= hlen + sizeof(struct frag_hdr); + frag_id = ipv6_select_ident(net, rt); + if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; @@ -632,11 +634,10 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, skb_reset_network_header(skb); memcpy(skb_network_header(skb), tmp_hdr, hlen); - ipv6_select_ident(net, fh, rt); fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(IP6_MF); - frag_id = fh->identification; + fh->identification = frag_id; first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); @@ -778,11 +779,7 @@ slow_path: */ fh->nexthdr = nexthdr; fh->reserved = 0; - if (!frag_id) { - ipv6_select_ident(net, fh, rt); - frag_id = fh->identification; - } else - fh->identification = frag_id; + fh->identification = frag_id; /* * Copy a block of the IP datagram. @@ -1064,7 +1061,6 @@ static inline int ip6_ufo_append_data(struct sock *sk, { struct sk_buff *skb; - struct frag_hdr fhdr; int err; /* There is support for UDP large send offload by network @@ -1106,8 +1102,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - sizeof(struct frag_hdr)) & ~7; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - ipv6_select_ident(sock_net(sk), &fhdr, rt); - skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), rt); append: return skb_append_datato_frags(sk, skb, getfrag, from, diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 85892af57364..ef0e2326496b 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -60,8 +60,7 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); -void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr, - struct rt6_info *rt) +u32 ipv6_select_ident(struct net *net, struct rt6_info *rt) { static u32 ip6_idents_hashrnd __read_mostly; u32 id; @@ -70,7 +69,7 @@ void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr, id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr, &rt->rt6i_src.addr); - fhdr->identification = htonl(id); + return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); -- cgit From fd0273d7939f2ce3247f6aac5f6b9a0135d4cd39 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:55:57 -0700 Subject: ipv6: Remove external dependency on rt6i_dst and rt6i_src This patch removes the assumptions that the returned rt is always a RTF_CACHE entry with the rt6i_dst and rt6i_src containing the destination and source address. The dst and src can be recovered from the calling site. We may consider to rename (rt6i_dst, rt6i_src) to (rt6i_key_dst, rt6i_key_src) later. Signed-off-by: Martin KaFai Lau Reviewed-by: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 2 +- net/ipv6/ip6_output.c | 13 ++++++++----- net/ipv6/ndisc.c | 2 +- net/ipv6/output_core.c | 10 ++++++---- net/ipv6/tcp_ipv6.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- net/sctp/ipv6.c | 3 ++- 7 files changed, 21 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 2c2b5d51f15c..24b359d1425b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -207,7 +207,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct inet_peer *peer; peer = inet_getpeer_v6(net->ipv6.peers, - &rt->rt6i_dst.addr, 1); + &fl6->daddr, 1); res = inet_peer_xrlim_allow(peer, tmo); if (peer) inet_putpeer(peer); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 05e2cdf938bd..3097f2c5e890 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -459,7 +459,7 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) @@ -584,7 +584,8 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, } mtu -= hlen + sizeof(struct frag_hdr); - frag_id = ipv6_select_ident(net, rt); + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); @@ -1057,7 +1058,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, int transhdrlen, int mtu, unsigned int flags, - struct rt6_info *rt) + const struct flowi6 *fl6) { struct sk_buff *skb; @@ -1102,7 +1103,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - sizeof(struct frag_hdr)) & ~7; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), rt); + skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), + &fl6->daddr, + &fl6->saddr); append: return skb_append_datato_frags(sk, skb, getfrag, from, @@ -1327,7 +1330,7 @@ emsgsize: (sk->sk_type == SOCK_DGRAM)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, - transhdrlen, mtu, flags, rt); + transhdrlen, mtu, flags, fl6); if (err) goto error; return 0; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 96f153c0846b..0a05b35a90fc 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1506,7 +1506,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index ef0e2326496b..055e85cb7b65 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -10,7 +10,8 @@ #include static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, - struct in6_addr *dst, struct in6_addr *src) + const struct in6_addr *dst, + const struct in6_addr *src) { u32 hash, id; @@ -60,15 +61,16 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); -u32 ipv6_select_ident(struct net *net, struct rt6_info *rt) +u32 ipv6_select_ident(struct net *net, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { static u32 ip6_idents_hashrnd __read_mostly; u32 id; net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr, - &rt->rt6i_src.addr); + id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr); return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fefe4455e1f3..1a9390a370c0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -262,7 +262,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, rt = (struct rt6_info *) dst; if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && - ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr)) + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 19986ec5f21a..38f862728f75 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -781,7 +781,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); @@ -1346,7 +1346,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e703ff7fed40..17a0120ae5a5 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -332,7 +332,8 @@ out: rt = (struct rt6_info *)dst; t->dst = dst; t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; - pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr, + pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl6->saddr); } else { t->dst = NULL; -- cgit From 2647a9b07032c5a95ddee1fcb65d95bddbc6b7f9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:55:58 -0700 Subject: ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST When creating a RTF_CACHE route, RTF_ANYCAST is set based on rt6i_dst. Also, rt6i_gateway is always set to the nexthop while the nexthop could be a gateway or the rt6i_dst.addr. After removing the rt6i_dst and rt6i_src dependency in the last patch, we also need to stop the caller from depending on rt6i_gateway and RTF_ANYCAST. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/bluetooth/6lowpan.c | 2 +- net/ipv6/icmp.c | 4 ++-- net/ipv6/ip6_output.c | 5 +++-- net/ipv6/route.c | 6 +----- net/netfilter/nf_conntrack_h323_main.c | 4 ++-- net/netfilter/xt_addrtype.c | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 1742b849fcff..f3d6046c8ee7 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -192,7 +192,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, if (ipv6_addr_any(nexthop)) return NULL; } else { - nexthop = rt6_nexthop(rt); + nexthop = rt6_nexthop(rt, daddr); /* We need to remember the address because it is needed * by bt_xmit() when sending the packet. In bt_xmit(), the diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 24b359d1425b..713d7434c911 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -337,7 +337,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, * We won't send icmp if the destination is known * anycast. */ - if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + if (ipv6_anycast_destination(dst, &fl6->daddr)) { net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); @@ -564,7 +564,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && - ipv6_anycast_destination(skb))) + ipv6_anycast_destination(skb_dst(skb), saddr))) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3097f2c5e890..61741676b987 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -105,7 +105,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst); + nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); @@ -934,7 +934,8 @@ static int ip6_dst_lookup_tail(struct sock *sk, */ rt = (struct rt6_info *) *dst; rcu_read_lock_bh(); - n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt)); + n = __ipv6_neigh_lookup_noref(rt->dst.dev, + rt6_nexthop(rt, &fl6->daddr)); err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; rcu_read_unlock_bh(); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0c889cb89cc3..ce3b1754b4f5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1945,11 +1945,7 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->dst.lastuse = jiffies; - - if (ort->rt6i_flags & RTF_GATEWAY) - rt->rt6i_gateway = ort->rt6i_gateway; - else - rt->rt6i_gateway = *dest; + rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags; rt6_set_from(rt, ort); rt->rt6i_metric = 0; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 1d69f5b9748f..9511af04dc81 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net, flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2), false)) { - if (ipv6_addr_equal(rt6_nexthop(rt1), - rt6_nexthop(rt2)) && + if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr), + rt6_nexthop(rt2, &fl2.daddr)) && rt1->dst.dev == rt2->dst.dev) ret = 1; dst_release(&rt2->dst); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index fab6eea1bf38..5b4743cc0436 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; - if (rt->rt6i_flags & RTF_ANYCAST) + if (ipv6_anycast_destination((struct dst_entry *)rt, addr)) ret |= XT_ADDRTYPE_ANYCAST; dst_release(&rt->dst); -- cgit From 8b9df2657704dd313333a79497dde429f9190caa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:55:59 -0700 Subject: ipv6: Combine rt6_alloc_cow and rt6_alloc_clone A prep work for creating RTF_CACHE on exception only. After this patch, the same condition (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)) is checked twice. This redundancy will be removed in the later patch. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/route.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ce3b1754b4f5..f199d6357b31 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -655,6 +655,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) return match ? match : net->ipv6.ip6_null_entry; } +static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); +} + #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) @@ -833,9 +838,9 @@ int ip6_ins_rt(struct rt6_info *rt) return __ip6_ins_rt(rt, &info, &mxc); } -static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_info *rt; @@ -846,33 +851,24 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, rt = ip6_rt_copy(ort, daddr); if (rt) { - if (ort->rt6i_dst.plen != 128 && - ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) - rt->rt6i_flags |= RTF_ANYCAST; - rt->rt6i_flags |= RTF_CACHE; + if (!rt6_is_gw_or_nonexthop(ort)) { + if (ort->rt6i_dst.plen != 128 && + ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES - if (rt->rt6i_src.plen && saddr) { - rt->rt6i_src.addr = *saddr; - rt->rt6i_src.plen = 128; - } + if (rt->rt6i_src.plen && saddr) { + rt->rt6i_src.addr = *saddr; + rt->rt6i_src.plen = 128; + } #endif + } } return rt; } -static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, - const struct in6_addr *daddr) -{ - struct rt6_info *rt = ip6_rt_copy(ort, daddr); - - if (rt) - rt->rt6i_flags |= RTF_CACHE; - return rt; -} - static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -918,10 +914,9 @@ redo_rt6_select: if (rt->rt6i_flags & RTF_CACHE) goto out2; - if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) - nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); - else if (!(rt->dst.flags & DST_HOST) || !(rt->rt6i_flags & RTF_LOCAL)) - nrt = rt6_alloc_clone(rt, &fl6->daddr); + if (!rt6_is_gw_or_nonexthop(rt) || + !(rt->dst.flags & DST_HOST) || !(rt->rt6i_flags & RTF_LOCAL)) + nrt = ip6_rt_cache_alloc(rt, &fl6->daddr, &fl6->saddr); else goto out2; -- cgit From 45e4fd26683c9a5f88600d91b08a484f7f09226a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:00 -0700 Subject: ipv6: Only create RTF_CACHE routes after encountering pmtu exception This patch creates a RTF_CACHE routes only after encountering a pmtu exception. After ip6_rt_update_pmtu() has inserted the RTF_CACHE route to the fib6 tree, the rt->rt6i_node->fn_sernum is bumped which will fail the ip6_dst_check() and trigger a relookup. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 1 + net/ipv6/route.c | 100 +++++++++++++++++++++++++++-------------------------- 2 files changed, 52 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bde57b113009..83341b3a248d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -738,6 +738,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); + iter->rt6i_pmtu = rt->rt6i_pmtu; return -EEXIST; } /* If we have the same destination and the same metric, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f199d6357b31..e7ae2430dfed 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -873,16 +873,13 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *nrt; + struct rt6_info *rt; int strict = 0; - int attempts = 3; - int err; strict |= flags & RT6_LOOKUP_F_IFACE; if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; -redo_fib6_lookup_lock: read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); @@ -901,46 +898,12 @@ redo_rt6_select: strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; - } else { - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - goto out2; } } dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - if (rt->rt6i_flags & RTF_CACHE) - goto out2; - - if (!rt6_is_gw_or_nonexthop(rt) || - !(rt->dst.flags & DST_HOST) || !(rt->rt6i_flags & RTF_LOCAL)) - nrt = ip6_rt_cache_alloc(rt, &fl6->daddr, &fl6->saddr); - else - goto out2; - - ip6_rt_put(rt); - rt = nrt ? : net->ipv6.ip6_null_entry; - - dst_hold(&rt->dst); - if (nrt) { - err = ip6_ins_rt(nrt); - if (!err) - goto out2; - } - - if (--attempts <= 0) - goto out2; - - /* - * Race condition! In the gap, when table->tb6_lock was - * released someone could insert this route. Relookup. - */ - ip6_rt_put(rt); - goto redo_fib6_lookup_lock; - -out2: rt6_dst_from_metrics_check(rt); rt->dst.lastuse = jiffies; rt->dst.__use++; @@ -1113,24 +1076,63 @@ static void ip6_link_failure(struct sk_buff *skb) } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) +static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) +{ + struct net *net = dev_net(rt->dst.dev); + + rt->rt6i_flags |= RTF_MODIFIED; + rt->rt6i_pmtu = mtu; + rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); +} + +static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, + const struct ipv6hdr *iph, u32 mtu) { struct rt6_info *rt6 = (struct rt6_info *)dst; - dst_confirm(dst); - if (mtu < dst_mtu(dst) && (rt6->rt6i_flags & RTF_CACHE)) { - struct net *net = dev_net(dst->dev); + if (rt6->rt6i_flags & RTF_LOCAL) + return; - rt6->rt6i_flags |= RTF_MODIFIED; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + dst_confirm(dst); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu >= dst_mtu(dst)) + return; - rt6->rt6i_pmtu = mtu; - rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); + if (rt6->rt6i_flags & RTF_CACHE) { + rt6_do_update_pmtu(rt6, mtu); + } else { + const struct in6_addr *daddr, *saddr; + struct rt6_info *nrt6; + + if (iph) { + daddr = &iph->daddr; + saddr = &iph->saddr; + } else if (sk) { + daddr = &sk->sk_v6_daddr; + saddr = &inet6_sk(sk)->saddr; + } else { + return; + } + nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + if (nrt6) { + rt6_do_update_pmtu(nrt6, mtu); + + /* ip6_ins_rt(nrt6) will bump the + * rt6->rt6i_node->fn_sernum + * which will fail the next rt6_check() and + * invalidate the sk->sk_dst_cache. + */ + ip6_ins_rt(nrt6); + } } } +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); +} + void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark) { @@ -1147,7 +1149,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); -- cgit From b197df4f0f3782782e9ea8996e91b65ae33e8dd9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:01 -0700 Subject: ipv6: Add rt6_get_cookie() function Instead of doing the rt6->rt6i_node check whenever we need to get the route's cookie. Refactor it into rt6_get_cookie(). It is a prep work to handle FLOWI_FLAG_KNOWN_NH and also percpu rt6_info later. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/tcp_ipv6.c | 3 +-- net/ipv6/xfrm6_policy.c | 6 ++---- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/sctp/ipv6.c | 2 +- 5 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5cafd92c2312..2e67b660118b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -151,7 +151,7 @@ EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + t->dst_cookie = rt6_get_cookie(rt); dst_release(t->dst_cache); t->dst_cache = dst; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1a9390a370c0..7be3d858cbf0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -99,8 +99,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); } } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 6ae256b4c55a..ed0583c1b9fc 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -76,8 +76,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, { if (dst->ops->family == AF_INET6) { struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_node) - path->path_cookie = rt->rt6i_node->fn_sernum; + path->path_cookie = rt6_get_cookie(rt); } path->u.rt6.rt6i_nfheader_len = nfheader_len; @@ -105,8 +104,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, RTF_LOCAL); xdst->u.rt6.rt6i_metric = rt->rt6i_metric; xdst->u.rt6.rt6i_node = rt->rt6i_node; - if (rt->rt6i_node) - xdst->route_cookie = rt->rt6i_node->fn_sernum; + xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 38f862728f75..5eff9f6dcfb8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -435,7 +435,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_unreach; } rt = (struct rt6_info *) dst; - cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 17a0120ae5a5..e917d27328ea 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -331,7 +331,7 @@ out: rt = (struct rt6_info *)dst; t->dst = dst; - t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl6->saddr); -- cgit From 48e8aa6e3137692d38f20e8bfff100e408c6bc53 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:02 -0700 Subject: ipv6: Set FLOWI_FLAG_KNOWN_NH at flowi6_flags The neighbor look-up used to depend on the rt6i_gateway (if there is a gateway) or the rt6i_dst (if it is a RTF_CACHE clone) as the nexthop address. Note that rt6i_dst is set to fl6->daddr for the RTF_CACHE clone where fl6->daddr is the one used to do the route look-up. Now, we only create RTF_CACHE clone after encountering exception. When doing the neighbor look-up with a route that is neither a gateway nor a RTF_CACHE clone, the daddr in skb will be used as the nexthop. In some cases, the daddr in skb is not the one used to do the route look-up. One example is in ip_vs_dr_xmit_v6() where the real nexthop server address is different from the one in the skb. This patch is going to follow the IPv4 approach and ask the ip6_pol_route() callers to set the FLOWI_FLAG_KNOWN_NH properly. In the next patch, ip6_pol_route() will honor the FLOWI_FLAG_KNOWN_NH and create a RTF_CACHE clone. Signed-off-by: Martin KaFai Lau Acked-by: Julian Anastasov Tested-by: Julian Anastasov Cc: Hannes Frederic Sowa Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/raw.c | 3 +++ net/netfilter/ipvs/ip_vs_xmit.c | 13 +++++++++---- net/netfilter/xt_TEE.c | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8072bd4139b7..484a5c144073 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -865,6 +865,9 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = np->ucast_oif; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + if (inet->hdrincl) + fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 5eff9f6dcfb8..bf66a8657a5f 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -364,13 +364,16 @@ err_unreach: #ifdef CONFIG_IP_VS_IPV6 static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, - struct in6_addr *ret_saddr, int do_xfrm) + struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) { struct dst_entry *dst; struct flowi6 fl6 = { .daddr = *daddr, }; + if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; + dst = ip6_route_output(net, NULL, &fl6); if (dst->error) goto out_err; @@ -427,7 +430,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, &dest_dst->dst_saddr.in6, - do_xfrm); + do_xfrm, rt_mode); if (!dst) { __ip_vs_dst_set(dest, NULL, NULL, 0); spin_unlock_bh(&dest->dst_lock); @@ -446,7 +449,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; - dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, + rt_mode); if (!dst) goto err_unreach; rt = (struct rt6_info *) dst; @@ -1164,7 +1168,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL); + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH); if (local < 0) goto tx_error; if (local) { diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 292934d23482..a747eb475b68 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -152,6 +152,7 @@ tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) fl6.daddr = info->gw.in6; fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); -- cgit From 3da59bd94583d1239e4fbdee452265a160b9cd71 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:03 -0700 Subject: ipv6: Create RTF_CACHE clone when FLOWI_FLAG_KNOWN_NH is set This patch always creates RTF_CACHE clone with DST_NOCACHE when FLOWI_FLAG_KNOWN_NH is set so that the rt6i_dst is set to the fl6->daddr. Signed-off-by: Martin KaFai Lau Acked-by: Julian Anastasov Tested-by: Julian Anastasov Cc: Hannes Frederic Sowa Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/route.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e7ae2430dfed..9e4c3c5d1591 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -901,13 +901,34 @@ redo_rt6_select: } } - dst_hold(&rt->dst); + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); - rt6_dst_from_metrics_check(rt); - rt->dst.lastuse = jiffies; - rt->dst.__use++; + if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { + goto done; + } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && + !(rt->rt6i_flags & RTF_GATEWAY))) { + /* Create a RTF_CACHE clone which will not be + * owned by the fib6 tree. It is for the special case where + * the daddr in the skb during the neighbor look-up is different + * from the fl6->daddr used to look-up route here. + */ + + struct rt6_info *uncached_rt; + + uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); + dst_release(&rt->dst); + + if (uncached_rt) + uncached_rt->dst.flags |= DST_NOCACHE; + else + uncached_rt = net->ipv6.ip6_null_entry; + dst_hold(&uncached_rt->dst); + return uncached_rt; + } +done: + rt6_dst_from_metrics_check(rt); return rt; } @@ -1019,6 +1040,26 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); } +static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +{ + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return &rt->dst; +} + +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +{ + if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + return &rt->dst; + else + return NULL; +} + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; @@ -1029,15 +1070,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) - return NULL; - - if (rt6_check_expired(rt)) - return NULL; rt6_dst_from_metrics_check(rt); - return dst; + if (unlikely(dst->flags & DST_NOCACHE)) + return rt6_dst_from_check(rt, cookie); + else + return rt6_check(rt, cookie); } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) -- cgit From 8d0b94afdca84598912347e61defa846a0988d04 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:04 -0700 Subject: ipv6: Keep track of DST_NOCACHE routes in case of iface down/unregister This patch keeps track of the DST_NOCACHE routes in a list and replaces its dev with loopback during the iface down/unregister event. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/route.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9e4c3c5d1591..94ce1e0b5a33 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -105,6 +105,67 @@ static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, int ifindex); #endif +struct uncached_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); + +static void rt6_uncached_list_add(struct rt6_info *rt) +{ + struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); + + rt->dst.flags |= DST_NOCACHE; + rt->rt6i_uncached_list = ul; + + spin_lock_bh(&ul->lock); + list_add_tail(&rt->rt6i_uncached, &ul->head); + spin_unlock_bh(&ul->lock); +} + +static void rt6_uncached_list_del(struct rt6_info *rt) +{ + if (!list_empty(&rt->rt6i_uncached)) { + struct uncached_list *ul = rt->rt6i_uncached_list; + + spin_lock_bh(&ul->lock); + list_del(&rt->rt6i_uncached); + spin_unlock_bh(&ul->lock); + } +} + +static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) +{ + struct net_device *loopback_dev = net->loopback_dev; + int cpu; + + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + struct rt6_info *rt; + + spin_lock_bh(&ul->lock); + list_for_each_entry(rt, &ul->head, rt6i_uncached) { + struct inet6_dev *rt_idev = rt->rt6i_idev; + struct net_device *rt_dev = rt->dst.dev; + + if (rt_idev && (rt_idev->dev == dev || !dev) && + rt_idev->dev != loopback_dev) { + rt->rt6i_idev = in6_dev_get(loopback_dev); + in6_dev_put(rt_idev); + } + + if (rt_dev && (rt_dev == dev || !dev) && + rt_dev != loopback_dev) { + rt->dst.dev = loopback_dev; + dev_hold(rt->dst.dev); + dev_put(rt_dev); + } + } + spin_unlock_bh(&ul->lock); + } +} + static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { struct rt6_info *rt = (struct rt6_info *)dst; @@ -262,6 +323,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); INIT_LIST_HEAD(&rt->rt6i_siblings); + INIT_LIST_HEAD(&rt->rt6i_uncached); } return rt; } @@ -269,11 +331,14 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct inet6_dev *idev = rt->rt6i_idev; struct dst_entry *from = dst->from; + struct inet6_dev *idev; dst_destroy_metrics_generic(dst); + rt6_uncached_list_del(rt); + + idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); @@ -920,7 +985,7 @@ redo_rt6_select: dst_release(&rt->dst); if (uncached_rt) - uncached_rt->dst.flags |= DST_NOCACHE; + rt6_uncached_list_add(uncached_rt); else uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); @@ -2367,6 +2432,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev) fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); + rt6_uncached_list_flush_dev(net, dev); } struct rt6_mtu_change_arg { @@ -3263,6 +3329,7 @@ static struct notifier_block ip6_route_dev_notifier = { int __init ip6_route_init(void) { int ret; + int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = @@ -3322,6 +3389,13 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + + INIT_LIST_HEAD(&ul->head); + spin_lock_init(&ul->lock); + } + out: return ret; -- cgit From 83a09abd1a8badbbb715f928d07c65ac47709c47 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:05 -0700 Subject: ipv6: Break up ip6_rt_copy() This patch breaks up ip6_rt_copy() into ip6_rt_copy_init() and ip6_rt_cache_alloc(). In the later patch, we need to create a percpu rt6_info copy. Hence, refactor the common rt6_info init codes to ip6_rt_copy_init(). Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/route.c | 90 ++++++++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 94ce1e0b5a33..90c8eaa24565 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -72,8 +72,7 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest); +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -913,22 +912,32 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, * Clone the route. */ - rt = ip6_rt_copy(ort, daddr); + if (ort->rt6i_flags & RTF_CACHE) + ort = (struct rt6_info *)ort->dst.from; - if (rt) { - rt->rt6i_flags |= RTF_CACHE; + rt = ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, + 0, ort->rt6i_table); + + if (!rt) + return NULL; + + ip6_rt_copy_init(rt, ort); + rt->rt6i_flags |= RTF_CACHE; + rt->rt6i_metric = 0; + rt->dst.flags |= DST_HOST; + rt->rt6i_dst.addr = *daddr; + rt->rt6i_dst.plen = 128; - if (!rt6_is_gw_or_nonexthop(ort)) { - if (ort->rt6i_dst.plen != 128 && - ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) - rt->rt6i_flags |= RTF_ANYCAST; + if (!rt6_is_gw_or_nonexthop(ort)) { + if (ort->rt6i_dst.plen != 128 && + ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES - if (rt->rt6i_src.plen && saddr) { - rt->rt6i_src.addr = *saddr; - rt->rt6i_src.plen = 128; - } -#endif + if (rt->rt6i_src.plen && saddr) { + rt->rt6i_src.addr = *saddr; + rt->rt6i_src.plen = 128; } +#endif } return rt; @@ -1980,7 +1989,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)) ); - nrt = ip6_rt_copy(rt, &msg->dest); + nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) goto out; @@ -2022,42 +2031,25 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); } -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest) -{ - struct net *net = dev_net(ort->dst.dev); - struct rt6_info *rt; - - if (ort->rt6i_flags & RTF_CACHE) - ort = (struct rt6_info *)ort->dst.from; - - rt = ip6_dst_alloc(net, ort->dst.dev, 0, - ort->rt6i_table); - - if (rt) { - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->dst.flags |= DST_HOST; - - rt->rt6i_dst.addr = *dest; - rt->rt6i_dst.plen = 128; - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; - rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags; - rt6_set_from(rt, ort); - rt->rt6i_metric = 0; - +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt6i_dst = ort->rt6i_dst; + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); + rt->rt6i_metric = ort->rt6i_metric; #ifdef CONFIG_IPV6_SUBTREES - memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); + rt->rt6i_src = ort->rt6i_src; #endif - memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); - rt->rt6i_table = ort->rt6i_table; - } - return rt; + rt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt->rt6i_table = ort->rt6i_table; } #ifdef CONFIG_IPV6_ROUTE_INFO -- cgit From d52d3997f843ffefaa8d8462790ffcaca6c74192 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:06 -0700 Subject: ipv6: Create percpu rt6_info After the patch 'ipv6: Only create RTF_CACHE routes after encountering pmtu exception', we need to compensate the performance hit (bouncing dst->__refcnt). Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 24 +++++++++- net/ipv6/route.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 139 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 83341b3a248d..55d19861ab20 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -154,10 +154,32 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) +{ + int cpu; + + if (!non_pcpu_rt->rt6i_pcpu) + return; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_free(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } +} + static void rt6_release(struct rt6_info *rt) { - if (atomic_dec_and_test(&rt->rt6i_ref)) + if (atomic_dec_and_test(&rt->rt6i_ref)) { + rt6_free_pcpu(rt); dst_free(&rt->dst); + } } static void fib6_link_table(struct net *net, struct fib6_table *tb) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 90c8eaa24565..1a1122a6bbf5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -165,11 +165,18 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } +static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) +{ + return dst_metrics_write_ptr(rt->dst.from); +} + static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flags & RTF_CACHE) + if (rt->rt6i_flags & RTF_PCPU) + return rt6_pcpu_cow_metrics(rt); + else if (rt->rt6i_flags & RTF_CACHE) return NULL; else return dst_cow_metrics_generic(dst, old); @@ -309,10 +316,10 @@ static const struct rt6_info ip6_blk_hole_entry_template = { #endif /* allocate dst with ip6_dst_ops */ -static inline struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags, - struct fib6_table *table) +static struct rt6_info *__ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); @@ -327,6 +334,34 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, return rt; } +static struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) +{ + struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table); + + if (rt) { + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); + if (rt->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **p; + + p = per_cpu_ptr(rt->rt6i_pcpu, cpu); + /* no one shares rt */ + *p = NULL; + } + } else { + dst_destroy((struct dst_entry *)rt); + return NULL; + } + } + + return rt; +} + static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; @@ -335,6 +370,9 @@ static void ip6_dst_destroy(struct dst_entry *dst) dst_destroy_metrics_generic(dst); + if (rt->rt6i_pcpu) + free_percpu(rt->rt6i_pcpu); + rt6_uncached_list_del(rt); idev = rt->rt6i_idev; @@ -912,11 +950,11 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, * Clone the route. */ - if (ort->rt6i_flags & RTF_CACHE) + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) ort = (struct rt6_info *)ort->dst.from; - rt = ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, - 0, ort->rt6i_table); + rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, + 0, ort->rt6i_table); if (!rt) return NULL; @@ -943,6 +981,54 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, return rt; } +static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt; + + pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), + rt->dst.dev, rt->dst.flags, + rt->rt6i_table); + + if (!pcpu_rt) + return NULL; + ip6_rt_copy_init(pcpu_rt, rt); + pcpu_rt->rt6i_protocol = rt->rt6i_protocol; + pcpu_rt->rt6i_flags |= RTF_PCPU; + return pcpu_rt; +} + +/* It should be called with read_lock_bh(&tb6_lock) acquired */ +static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt, *prev, **p; + + p = this_cpu_ptr(rt->rt6i_pcpu); + pcpu_rt = *p; + + if (pcpu_rt) + goto done; + + pcpu_rt = ip6_rt_pcpu_alloc(rt); + if (!pcpu_rt) { + struct net *net = dev_net(rt->dst.dev); + + pcpu_rt = net->ipv6.ip6_null_entry; + goto done; + } + + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = prev; + } + +done: + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + return pcpu_rt; +} + static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -975,11 +1061,13 @@ redo_rt6_select: } } - dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { - goto done; + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); + + rt6_dst_from_metrics_check(rt); + return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(rt->rt6i_flags & RTF_GATEWAY))) { /* Create a RTF_CACHE clone which will not be @@ -990,6 +1078,9 @@ redo_rt6_select: struct rt6_info *uncached_rt; + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); + uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -997,13 +1088,22 @@ redo_rt6_select: rt6_uncached_list_add(uncached_rt); else uncached_rt = net->ipv6.ip6_null_entry; + dst_hold(&uncached_rt->dst); return uncached_rt; - } -done: - rt6_dst_from_metrics_check(rt); - return rt; + } else { + /* Get a percpu copy */ + + struct rt6_info *pcpu_rt; + + rt->dst.lastuse = jiffies; + rt->dst.__use++; + pcpu_rt = rt6_get_pcpu_route(rt); + read_unlock_bh(&table->tb6_lock); + + return pcpu_rt; + } } static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, @@ -1147,7 +1247,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); - if (unlikely(dst->flags & DST_NOCACHE)) + if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); -- cgit From 485fca664d76c47fb2a17dcd2ce705a0f137cc3b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 May 2015 00:44:16 +0200 Subject: ipv6: don't increase size when refragmenting forwarded ipv6 skbs since commit 6aafeef03b9d ("netfilter: push reasm skb through instead of original frag skbs") we will end up sometimes re-fragmenting skbs that we've reassembled. ipv6 defrag preserves the original skbs using the skb frag list, i.e. as long as the skb frag list is preserved there is no problem since we keep original geometry of fragments intact. However, in the rare case where the frag list is munged or skb is linearized, we might send larger fragments than what we originally received. A router in the path might then send packet-too-big errors even if sender never sent fragments exceeding the reported mtu: mtu 1500 - 1500:1400 - 1400:1280 - 1280 A R1 R2 B 1 - A sends to B, fragment size 1400 2 - R2 sends pkttoobig error for 1280 3 - A sends to B, fragment size 1280 4 - R2 sends pkttoobig error for 1280 again because it sees fragments of size 1400. make sure ip6_fragment always caps MTU at largest packet size seen when defragmented skb is forwarded. Acked-by: Hannes Frederic Sowa Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 61741676b987..d5f7716662db 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -564,18 +564,17 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. */ - if (unlikely(!skb->ignore_df && skb->len > mtu) || - (IP6CB(skb)->frag_max_size && - IP6CB(skb)->frag_max_size > mtu)) { - if (skb->sk && dst_allfrag(skb_dst(skb))) - sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + if (unlikely(!skb->ignore_df && skb->len > mtu)) + goto fail_toobig; - skb->dev = skb_dst(skb)->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); - kfree_skb(skb); - return -EMSGSIZE; + if (IP6CB(skb)->frag_max_size) { + if (IP6CB(skb)->frag_max_size > mtu) + goto fail_toobig; + + /* don't send fragments larger than what we received */ + mtu = IP6CB(skb)->frag_max_size; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; } if (np && np->frag_size < mtu) { @@ -813,6 +812,14 @@ slow_path: consume_skb(skb); return err; +fail_toobig: + if (skb->sk && dst_allfrag(skb_dst(skb))) + sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + err = -EMSGSIZE; + fail: IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); -- cgit From 6ae4ae8e512bd229f806c22f8a2cd751e4f987c2 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 23 May 2015 03:12:34 +0200 Subject: bridge: allow setting hash_max + multicast_router if interface is down MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Network managers like netifd (used in OpenWRT for instance) try to configure interface options after creation but before setting the interface up. Unfortunately the sysfs / bridge currently only allows to configure the hash_max and multicast_router options when the bridge interface is up. But since br_multicast_init() doesn't start any timers and only sets default values and initializes timers it should be save to reconfigure the default values after that, before things actually get active after the bridge is set up. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index d7e103e3538a..7c78b8df1d81 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1772,11 +1772,9 @@ out: int br_multicast_set_router(struct net_bridge *br, unsigned long val) { - int err = -ENOENT; + int err = -EINVAL; spin_lock_bh(&br->multicast_lock); - if (!netif_running(br->dev)) - goto unlock; switch (val) { case 0: @@ -1787,13 +1785,8 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) br->multicast_router = val; err = 0; break; - - default: - err = -EINVAL; - break; } -unlock: spin_unlock_bh(&br->multicast_lock); return err; @@ -1802,11 +1795,9 @@ unlock: int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) { struct net_bridge *br = p->br; - int err = -ENOENT; + int err = -EINVAL; spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || p->state == BR_STATE_DISABLED) - goto unlock; switch (val) { case 0: @@ -1828,13 +1819,8 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) br_multicast_add_router(br, p); break; - - default: - err = -EINVAL; - break; } -unlock: spin_unlock(&br->multicast_lock); return err; @@ -1939,15 +1925,11 @@ unlock: int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val) { - int err = -ENOENT; + int err = -EINVAL; u32 old; struct net_bridge_mdb_htable *mdb; spin_lock_bh(&br->multicast_lock); - if (!netif_running(br->dev)) - goto unlock; - - err = -EINVAL; if (!is_power_of_2(val)) goto unlock; -- cgit From 005e8709c6179b75830971efa891c7ed33e02d1e Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Mon, 25 May 2015 08:16:50 +0200 Subject: irda: use msecs_to_jiffies for conversion to jiffies API compliance scanning with coccinelle flagged: ./net/irda/timer.c:63:35-37: use of msecs_to_jiffies probably perferable Converting milliseconds to jiffies by "val * HZ / 1000" technically is not a clean solution as it does not handle all corner cases correctly. By changing the conversion to use msecs_to_jiffies(val) conversion is correct in all cases. Further the () around the arithmetic expression was dropped. Patch was compile tested for x86_64_defconfig + CONFIG_IRDA=m Patch is against 4.1-rc4 (localversion-next is -next-20150522) Signed-off-by: Nicholas Mc Guire Signed-off-by: David S. Miller --- net/irda/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/irda/timer.c b/net/irda/timer.c index 0c4c115a5cab..f2280f73b057 100644 --- a/net/irda/timer.c +++ b/net/irda/timer.c @@ -60,8 +60,8 @@ void irlap_start_query_timer(struct irlap_cb *self, int S, int s) * to avoid messing with for incoming connections requests and * to accommodate devices that perform discovery slower than us. * Jean II */ - timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s) - + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT); + timeout = msecs_to_jiffies(sysctl_slot_timeout) * (S - s) + + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT; /* Set or re-set the timer. We reset the timer for each received * discovery query, which allow us to automatically adjust to -- cgit From 7f1598678d4c05e3e085bf780a5ab3119637ac3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2015 16:02:21 -0700 Subject: ipv6: ipv6_select_ident() returns a __be32 ipv6_select_ident() returns a 32bit value in network order. Fixes: 286c2349f666 ("ipv6: Clean up ipv6_select_ident() and ip6_fragment()") Signed-off-by: Eric Dumazet Reported-by: kbuild test robot Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/output_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 055e85cb7b65..21678acd4521 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -61,9 +61,9 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); -u32 ipv6_select_ident(struct net *net, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +__be32 ipv6_select_ident(struct net *net, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { static u32 ip6_idents_hashrnd __read_mostly; u32 id; -- cgit From d4969581452be430135ba5040c5b6c9023c1d461 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2015 16:06:37 -0700 Subject: pktgen: remove one sparse error net/core/pktgen.c:2672:43: warning: incorrect type in assignment (different base types) net/core/pktgen.c:2672:43: expected unsigned short [unsigned] [short] [usertype] net/core/pktgen.c:2672:43: got restricted __be16 [usertype] protocol Let's use proper struct ethhdr instead of hard coding everything. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 676550f34560..d93cbc5715f4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2645,9 +2645,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; if (x) { - int ret; - __u8 *eth; + struct ethhdr *eth; struct iphdr *iph; + int ret; nhead = x->props.header_len - skb_headroom(skb); if (nhead > 0) { @@ -2667,9 +2667,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, goto err; } /* restore ll */ - eth = (__u8 *) skb_push(skb, ETH_HLEN); - memcpy(eth, pkt_dev->hh, 12); - *(u16 *) ð[12] = protocol; + eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN); + eth->h_proto = protocol; /* Update IPv4 header len as well as checksum value */ iph = ip_hdr(skb); -- cgit From eb8d7baae21584a7b27e79c2deca3eee72376ff0 Mon Sep 17 00:00:00 2001 From: Wilson Kok Date: Mon, 25 May 2015 06:39:31 -0700 Subject: bridge: skip fdb add if the port shouldn't learn Check in fdb_add_entry() if the source port should learn, similar check is used in br_fdb_update. Note that new fdb entries which are added manually or as local ones are still permitted. This patch has been tested by running traffic via a bridge port and switching the port's state, also by manually adding/removing entries from the bridge's fdb. Signed-off-by: Wilson Kok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e0670d7054f9..7896cf143045 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -736,6 +736,12 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, struct net_bridge_fdb_entry *fdb; bool modified = false; + /* If the port cannot learn allow only local and static entries */ + if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) && + !(source->state == BR_STATE_LEARNING || + source->state == BR_STATE_FORWARDING)) + return -EPERM; + fdb = fdb_find(head, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) -- cgit From f72186d22aad44100e2dd17ccdcf13c4418ec3cb Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Tue, 26 May 2015 03:31:09 +0200 Subject: Bluetooth: mgmt: fix typos A few comments had minor typos. These are being fixed. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7fd87e7135b5..a6f21f8c2f98 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7577,7 +7577,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) memset(&ev, 0, sizeof(ev)); /* Devices using resolvable or non-resolvable random addresses - * without providing an indentity resolving key don't require + * without providing an identity resolving key don't require * to store long term keys. Their addresses will change the * next time around. * @@ -7617,7 +7617,7 @@ void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk) /* For identity resolving keys from devices that are already * using a public address or static random address, do not * ask for storing this key. The identity resolving key really - * is only mandatory for devices using resovlable random + * is only mandatory for devices using resolvable random * addresses. * * Storing all identity resolving keys has the downside that @@ -7646,7 +7646,7 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, memset(&ev, 0, sizeof(ev)); /* Devices using resolvable or non-resolvable random addresses - * without providing an indentity resolving key don't require + * without providing an identity resolving key don't require * to store signature resolving keys. Their addresses will change * the next time around. * -- cgit From 68319052d10d18462fbfd5571a5062cb7ec50788 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2015 18:55:48 -0700 Subject: net: remove a sparse error in secure_dccpv6_sequence_number() make C=2 CF=-D__CHECK_ENDIAN__ net/core/secure_seq.o net/core/secure_seq.c:157:50: warning: restricted __be32 degrades to integer Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/secure_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 51dd3193a33e..fd3ce461fbe6 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -154,7 +154,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + daddr[i]; + secret[i] = net_secret[i] + (__force u32)daddr[i]; secret[4] = net_secret[4] + (((__force u16)sport << 16) + (__force u16)dport); for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) -- cgit From 05c985436dfe81ab784251a54dd69be5b2dea98a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2015 18:50:01 -0700 Subject: net: fix inet_proto_csum_replace4() sparse errors make C=2 CF=-D__CHECK_ENDIAN__ net/core/utils.o ... net/core/utils.c:307:72: warning: incorrect type in argument 2 (different base types) net/core/utils.c:307:72: expected restricted __wsum [usertype] addend net/core/utils.c:307:72: got restricted __be32 [usertype] from net/core/utils.c:308:34: warning: incorrect type in argument 2 (different base types) net/core/utils.c:308:34: expected restricted __wsum [usertype] addend net/core/utils.c:308:34: got restricted __be32 [usertype] to net/core/utils.c:310:70: warning: incorrect type in argument 2 (different base types) net/core/utils.c:310:70: expected restricted __wsum [usertype] addend net/core/utils.c:310:70: got restricted __be32 [usertype] from net/core/utils.c:310:77: warning: incorrect type in argument 2 (different base types) net/core/utils.c:310:77: expected restricted __wsum [usertype] addend net/core/utils.c:310:77: got restricted __be32 [usertype] to net/core/utils.c:312:72: warning: incorrect type in argument 2 (different base types) net/core/utils.c:312:72: expected restricted __wsum [usertype] addend net/core/utils.c:312:72: got restricted __be32 [usertype] from net/core/utils.c:313:35: warning: incorrect type in argument 2 (different base types) net/core/utils.c:313:35: expected restricted __wsum [usertype] addend net/core/utils.c:313:35: got restricted __be32 [usertype] to Note we can use csum_replace4() helper Fixes: 58e3cac5613aa ("net: optimise inet_proto_csum_replace4()") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/utils.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index 7b803884c162..a7732a068043 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -304,13 +304,15 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, int pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), - to)); + csum_replace4(sum, from, to); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to); + skb->csum = ~csum_add(csum_sub(~(skb->csum), + (__force __wsum)from), + (__force __wsum)to); } else if (pseudohdr) - *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from), - to)); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), + (__force __wsum)from), + (__force __wsum)to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); -- cgit From 80279fb7ba5b71981a60988b0307afa43f78f6b1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 22 May 2015 16:22:20 +0200 Subject: cfg80211: properly send NL80211_ATTR_DISCONNECTED_BY_AP in disconnect When we disconnect from the AP, drivers call cfg80211_disconnect(). This doesn't know whether the disconnection was initiated locally or by the AP though, which can cause problems with the supplicant, for example with WPS. This issue obviously doesn't show up with any mac80211 based driver since mac80211 doesn't call this function. Fix this by requiring drivers to indicate whether the disconnect is locally generated or not. I've tried to update the drivers, but may not have gotten the values correct, and some drivers may currently not be able to report correct values. In case of doubt I left it at false, which is the current behaviour. For libertas, make adjustments as indicated by Dan Williams. Reported-by: Matthieu Mauger Tested-by: Matthieu Mauger Signed-off-by: Johannes Berg --- net/wireless/core.h | 1 + net/wireless/sme.c | 4 +++- net/wireless/util.c | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index 801cd49c5a0c..311eef26bf88 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -222,6 +222,7 @@ struct cfg80211_event { const u8 *ie; size_t ie_len; u16 reason; + bool locally_generated; } dc; struct { u8 bssid[ETH_ALEN]; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index d11454f87bac..8020b5b094d4 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -938,7 +938,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, } void cfg80211_disconnected(struct net_device *dev, u16 reason, - const u8 *ie, size_t ie_len, gfp_t gfp) + const u8 *ie, size_t ie_len, + bool locally_generated, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -954,6 +955,7 @@ void cfg80211_disconnected(struct net_device *dev, u16 reason, ev->dc.ie_len = ie_len; memcpy((void *)ev->dc.ie, ie, ie_len); ev->dc.reason = reason; + ev->dc.locally_generated = locally_generated; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); diff --git a/net/wireless/util.c b/net/wireless/util.c index 70051ab52f4f..4cb34557b873 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -887,7 +887,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, - ev->dc.reason, true); + ev->dc.reason, + !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, -- cgit From e34fd879f5516496c7241c9c2caf3a108295a30c Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Tue, 26 May 2015 15:06:10 +0300 Subject: mac802154: Avoid rtnl deadlock in mac802154_wpan_ioctl(). ->ndo_do_ioctl() can be entered with the rtnl lock already held, for example when sending a wext ioctl to a device (in which case the rtnl lock is taken by wext_ioctl_dispatch()), but mac802154_wpan_ioctl() currently unconditionally takes the rtnl lock on entry, which can cause deadlocks. To fix this, bail out of mac802154_wpan_ioctl() before taking the rtnl lock if the ioctl cmd is not one of the cmds we implement. Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index f30788d2702f..b544b5dc4bfb 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -62,6 +62,9 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; + if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR) + return err; + rtnl_lock(); switch (cmd) { -- cgit From c032705ebfed32a6dcda72d83c54f060d4bf1e6e Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Mon, 25 May 2015 15:38:24 +0300 Subject: ieee802154 socket: Return EMSGSIZE from raw_sendmsg() if packet too big. The proper return code for trying to send a packet that exceeds the outgoing interface's MTU is EMSGSIZE, not EINVAL, so patch ieee802154's raw_sendmsg() to do the right thing. (Its dgram_sendmsg() was already returning EMSGSIZE for this case.) Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index e5cc2537c2a3..d9fc5ccb1d0d 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -284,7 +284,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) if (size > mtu) { pr_debug("size = %Zu, mtu = %u\n", size, mtu); - err = -EINVAL; + err = -EMSGSIZE; goto out_dev; } -- cgit From 2f06550b3b0e26f54045337e34ec2a1b666bb6c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 24 May 2015 01:00:41 +0200 Subject: netfilter: remove unused comefrom hookmask argument Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ip_tables.c | 4 +--- net/ipv6/netfilter/ip6_tables.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c69db7fa25ee..583779fc3425 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1441,7 +1441,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ipt_ip *ip, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1510,8 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ip, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ip, &off); if (ret != 0) goto release_matches; ++j; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1a732a1d3c8e..d54f04970ee0 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1456,7 +1456,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1525,8 +1524,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ipv6, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; -- cgit From 529985de202276d0d3455d16d284d72efc357d98 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 26 May 2015 18:41:12 +0200 Subject: netfilter: default CONFIG_NETFILTER_INGRESS to y Useful to compile-test all options. Suggested-by: Alexei Stavoroitov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index db1c674397ad..9a89e7c67d78 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -3,6 +3,7 @@ menu "Core Netfilter Configuration" config NETFILTER_INGRESS bool "Netfilter ingress support" + default y select NET_INGRESS help This allows you to classify packets from ingress using the Netfilter -- cgit From ebddf1a8d78aa3436353fae75c4396e50cb2d6cf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 26 May 2015 18:41:20 +0200 Subject: netfilter: nf_tables: allow to bind table to net_device This patch adds the internal NFT_AF_NEEDS_DEV flag to indicate that you must attach this table to a net_device. This change is required by the follow up patch that introduces the new netdev table. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 46 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ad9d11fb29fd..2fd4e99dd074 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -399,6 +399,8 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, + [NFTA_TABLE_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -423,6 +425,10 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) goto nla_put_failure; + if (table->dev && + nla_put_string(skb, NFTA_TABLE_DEV, table->dev->name)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -608,6 +614,11 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if (flags == ctx->table->flags) return 0; + if ((ctx->afi->flags & NFT_AF_NEEDS_DEV) && + ctx->nla[NFTA_TABLE_DEV] && + nla_strcmp(ctx->nla[NFTA_TABLE_DEV], ctx->table->dev->name)) + return -EOPNOTSUPP; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) @@ -645,6 +656,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, struct nft_table *table; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct net_device *dev = NULL; u32 flags = 0; struct nft_ctx ctx; int err; @@ -679,30 +691,50 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } + if (afi->flags & NFT_AF_NEEDS_DEV) { + char ifname[IFNAMSIZ]; + + if (!nla[NFTA_TABLE_DEV]) + return -EOPNOTSUPP; + + nla_strlcpy(ifname, nla[NFTA_TABLE_DEV], IFNAMSIZ); + dev = dev_get_by_name(net, ifname); + if (!dev) + return -ENOENT; + } else if (nla[NFTA_TABLE_DEV]) { + return -EOPNOTSUPP; + } + + err = -EAFNOSUPPORT; if (!try_module_get(afi->owner)) - return -EAFNOSUPPORT; + goto err1; err = -ENOMEM; table = kzalloc(sizeof(*table), GFP_KERNEL); if (table == NULL) - goto err1; + goto err2; nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); table->flags = flags; + table->dev = dev; nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&table->list, &afi->tables); return 0; -err2: +err3: kfree(table); -err1: +err2: module_put(afi->owner); +err1: + if (dev != NULL) + dev_put(dev); + return err; } @@ -806,6 +838,9 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) { BUG_ON(ctx->table->use > 0); + if (ctx->table->dev) + dev_put(ctx->table->dev); + kfree(ctx->table); module_put(ctx->afi->owner); } @@ -1361,6 +1396,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, ops->priority = priority; ops->priv = chain; ops->hook = afi->hooks[ops->hooknum]; + ops->dev = table->dev; if (hookfn) ops->hook = hookfn; if (afi->hook_ops_init) -- cgit From ed6c4136f1571bd6ab362afc3410905a8a69ca42 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 26 May 2015 18:41:40 +0200 Subject: netfilter: nf_tables: add netdev table to filter from ingress This allows us to create netdev tables that contain ingress chains. Use skb_header_pointer() as we may see shared sk_buffs at this stage. This change provides access to the existing nf_tables features from the ingress hook. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 5 ++ net/netfilter/Makefile | 1 + net/netfilter/nf_tables_netdev.c | 183 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 net/netfilter/nf_tables_netdev.c (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9a89e7c67d78..bd5aaebaa9a2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -456,6 +456,11 @@ config NF_TABLES_INET help This option enables support for a mixed IPv4/IPv6 "inet" table. +config NF_TABLES_NETDEV + tristate "Netfilter nf_tables netdev tables support" + help + This option enables support for the "netdev" table. + config NFT_EXTHDR tristate "Netfilter nf_tables IPv6 exthdr module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index a87d8b8ec730..70d026d46fe7 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -75,6 +75,7 @@ nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o +obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c new file mode 100644 index 000000000000..04cb17057f46 --- /dev/null +++ b/net/netfilter/nf_tables_netdev.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +static inline void +nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, + const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct iphdr *iph, _iph; + u32 len, thoff; + + nft_set_pktinfo(pkt, ops, skb, state); + + iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), + &_iph); + if (!iph) + return; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return; + + len = ntohs(iph->tot_len); + thoff = iph->ihl * 4; + if (skb->len < len) + return; + else if (len < thoff) + return; + + pkt->tprot = iph->protocol; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; +} + +static inline void +__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *ip6h, _ip6h; + unsigned int thoff = 0; + unsigned short frag_off; + int protohdr; + u32 pkt_len; + + ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), + &_ip6h); + if (!ip6h) + return; + + if (ip6h->version != 6) + return; + + pkt_len = ntohs(ip6h->payload_len); + if (pkt_len + sizeof(*ip6h) > skb->len) + return; + + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + if (protohdr < 0) + return; + + pkt->tprot = protohdr; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = frag_off; +#endif +} + +static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + nft_set_pktinfo(pkt, ops, skb, state); + __nft_netdev_set_pktinfo_ipv6(pkt, ops, skb, state); +} + +static unsigned int +nft_do_chain_netdev(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + nft_netdev_set_pktinfo_ipv4(&pkt, ops, skb, state); + break; + case htons(ETH_P_IPV6): + nft_netdev_set_pktinfo_ipv6(&pkt, ops, skb, state); + break; + default: + nft_set_pktinfo(&pkt, ops, skb, state); + break; + } + + return nft_do_chain(&pkt, ops); +} + +static struct nft_af_info nft_af_netdev __read_mostly = { + .family = NFPROTO_NETDEV, + .nhooks = NF_NETDEV_NUMHOOKS, + .owner = THIS_MODULE, + .flags = NFT_AF_NEEDS_DEV, + .nops = 1, + .hooks = { + [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + }, +}; + +static int nf_tables_netdev_init_net(struct net *net) +{ + net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.netdev == NULL) + return -ENOMEM; + + memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev)); + + if (nft_register_afinfo(net, net->nft.netdev) < 0) + goto err; + + return 0; +err: + kfree(net->nft.netdev); + return -ENOMEM; +} + +static void nf_tables_netdev_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.netdev); + kfree(net->nft.netdev); +} + +static struct pernet_operations nf_tables_netdev_net_ops = { + .init = nf_tables_netdev_init_net, + .exit = nf_tables_netdev_exit_net, +}; + +static const struct nf_chain_type nft_filter_chain_netdev = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_NETDEV, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_NETDEV_INGRESS), +}; + +static int __init nf_tables_netdev_init(void) +{ + int ret; + + nft_register_chain_type(&nft_filter_chain_netdev); + ret = register_pernet_subsys(&nf_tables_netdev_net_ops); + if (ret < 0) + nft_unregister_chain_type(&nft_filter_chain_netdev); + + return ret; +} + +static void __exit nf_tables_netdev_exit(void) +{ + unregister_pernet_subsys(&nf_tables_netdev_net_ops); + nft_unregister_chain_type(&nft_filter_chain_netdev); +} + +module_init(nf_tables_netdev_init); +module_exit(nf_tables_netdev_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */ -- cgit From 01c8d2bbd407a178f5bfe0312c32aec345066697 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Mon, 25 May 2015 15:38:39 +0300 Subject: ieee802154: Remove 802.15.4/6LoWPAN checks for interface MTU. In the past, 802.15.4 interfaces and 6LoWPAN interfaces used the same dev->type (ARPHRD_IEEE802154), and 802.15.4 interfaces were distinguished from 6LoWPAN interfaces by their differing dev->mtu. 6LoWPAN interfaces have their own ARPHRD type now, so there is no longer any need to check dev->mtu to distinguish 802.15.4 devices from 6LoWPAN devices. Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl-mac.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index ada58a81b753..3503c38954f9 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -168,10 +168,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (!dev) return NULL; - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. - */ - if (dev->type != ARPHRD_IEEE802154 || dev->mtu != IEEE802154_MTU) { + if (dev->type != ARPHRD_IEEE802154) { dev_put(dev); return NULL; } @@ -455,11 +452,7 @@ int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; for_each_netdev(net, dev) { - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. - */ - if (idx < s_idx || dev->type != ARPHRD_IEEE802154 || - dev->mtu != IEEE802154_MTU) + if (idx < s_idx || dev->type != ARPHRD_IEEE802154) goto cont; if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid, @@ -789,11 +782,7 @@ ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, int rc; for_each_netdev(net, dev) { - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. - */ - if (idx < first_dev || dev->type != ARPHRD_IEEE802154 || - dev->mtu != IEEE802154_MTU) + if (idx < first_dev || dev->type != ARPHRD_IEEE802154) goto skip; data.ops = ieee802154_mlme_ops(dev); -- cgit From 66a3297f6d3f9dc35e27c3cec6b4437ac13f07ff Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Mon, 25 May 2015 15:38:45 +0300 Subject: ieee802154 socket: No need to check for ARPHRD_IEEE802154 in raw_bind(). ieee802154_get_dev() only returns devices that have dev->type == ARPHRD_IEEE802154, therefore, there is no need to check this again in raw_bind(). Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/socket.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index d9fc5ccb1d0d..02abef2c1621 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -226,15 +226,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) goto out; } - if (dev->type != ARPHRD_IEEE802154) { - err = -ENODEV; - goto out_put; - } - sk->sk_bound_dev_if = dev->ifindex; sk_dst_reset(sk); -out_put: dev_put(dev); out: release_sock(sk); -- cgit From 641459ca336edc009e044b76d16ee3d1ca6464c0 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Mon, 25 May 2015 15:38:51 +0300 Subject: mac802154: mac802154_mlme_start_req() optimisation. mac802154_mlme_start_req() calls ieee802154_mlme_ops(dev)->llsec->set_params() on the net_device passed into it, however, this net_device will always be a mac802154 net_device, so just call mac802154_set_params() directly instead. Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/mac_cmd.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index 5220c2b2735b..8606da459ff3 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -36,8 +36,8 @@ static int mac802154_mlme_start_req(struct net_device *dev, u8 pan_coord, u8 blx, u8 coord_realign) { - struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); - int rc = 0; + struct ieee802154_llsec_params params; + int changed = 0; ASSERT_RTNL(); @@ -47,26 +47,19 @@ static int mac802154_mlme_start_req(struct net_device *dev, dev->ieee802154_ptr->short_addr = addr->short_addr; mac802154_dev_set_page_channel(dev, page, channel); - if (ops->llsec) { - struct ieee802154_llsec_params params; - int changed = 0; + params.pan_id = addr->pan_id; + changed |= IEEE802154_LLSEC_PARAM_PAN_ID; - params.coord_shortaddr = addr->short_addr; - changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; + params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); + changed |= IEEE802154_LLSEC_PARAM_HWADDR; - params.pan_id = addr->pan_id; - changed |= IEEE802154_LLSEC_PARAM_PAN_ID; + params.coord_hwaddr = params.hwaddr; + changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; - params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); - changed |= IEEE802154_LLSEC_PARAM_HWADDR; + params.coord_shortaddr = addr->short_addr; + changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; - params.coord_hwaddr = params.hwaddr; - changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; - - rc = ops->llsec->set_params(dev, ¶ms, changed); - } - - return rc; + return mac802154_set_params(dev, ¶ms, changed); } static int mac802154_set_mac_params(struct net_device *dev, -- cgit From fc4f80524368dbf3fa2eba639c7b79bb014aea0a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 26 May 2015 23:11:31 +0200 Subject: nl802154: fix cca mode wpan phy flag This patch fix the handling to call cca mode setting. If the phy isn't flag then the driver doesn't support this setting. Signed-off-by: Alexander Aring Reported-by: Varka Bhadram Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 54f4959fc9bb..212a795b33c9 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -757,7 +757,7 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_phy_cca cca; - if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_CCA_MODE]) -- cgit From f3903bcc0091df871ac64261f65ed2e4c3519d39 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Tue, 26 May 2015 05:40:19 -0400 Subject: tipc: fix bug in link protocol message create function In commit dd3f9e70f59f43a5712eba9cf3ee4f1e6999540c ("tipc: add packet sequence number at instant of transmission") we made a change with the consequence that packets in the link backlog queue don't contain valid sequence numbers. However, when we create a link protocol message, we still use the sequence number of the first packet in the backlog, if there is any, as "next_sent" indicator in the message. This may entail unnecessary retransissions or stale packet transmission when there is very low traffic on the link. This commit fixes this issue by only using the current value of tipc_link::snd_nxt as indicator. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index fb2a003c8e6d..ca8b8e0f49b5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1320,8 +1320,6 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, if (!tipc_link_is_up(l_ptr)) return; - if (skb_queue_len(&l_ptr->backlogq)) - next_sent = buf_seqno(skb_peek(&l_ptr->backlogq)); msg_set_next_sent(msg, next_sent); if (!skb_queue_empty(&l_ptr->deferdq)) { last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq)); -- cgit From 095dc8e0c3686d586a01a50abc3e1bb9ac633054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2015 07:55:34 -0700 Subject: tcp: fix/cleanup inet_ehash_locks_alloc() If tcp ehash table is constrained to a very small number of buckets (eg boot parameter thash_entries=128), then we can crash if spinlock array has more entries. While we are at it, un-inline inet_ehash_locks_alloc() and make following changes : - Budget 2 cache lines per cpu worth of 'spinlocks' - Try to kmalloc() the array to avoid extra TLB pressure. (Most servers at Google allocate 8192 bytes for this hash table) - Get rid of various #ifdef Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3766bddb3e8a..185efef0f125 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -609,3 +610,33 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); + +int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, nblocks = 1; + + if (sizeof(spinlock_t) != 0) { + /* allocate 2 cache lines or at least one spinlock per cpu */ + nblocks = max_t(unsigned int, + 2 * L1_CACHE_BYTES / sizeof(spinlock_t), + 1); + nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + + /* no more locks than number of hash buckets */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); + + hashinfo->ehash_locks = kmalloc_array(nblocks, sizeof(spinlock_t), + GFP_KERNEL | __GFP_NOWARN); + if (!hashinfo->ehash_locks) + hashinfo->ehash_locks = vmalloc(nblocks * sizeof(spinlock_t)); + + if (!hashinfo->ehash_locks) + return -ENOMEM; + + for (i = 0; i < nblocks; i++) + spin_lock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = nblocks - 1; + return 0; +} +EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); -- cgit From d6a4e26afb80c049e7f94e1b7b506dcda61eee88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2015 08:55:28 -0700 Subject: tcp: tcp_tso_autosize() minimum is one packet By making sure sk->sk_gso_max_segs minimal value is one, and sysctl_tcp_min_tso_segs minimal value is one as well, tcp_tso_autosize() will return a non zero value. We can then revert 843925f33fcc293d80acf2c5c8a78adf3344d49b ("tcp: Do not apply TSO segment limit to non-TSO packets") and save few cpu cycles in fast path. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Herbert Xu Acked-by: Neal Cardwell Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++++- net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 29124fcdc42a..e72633c346b1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1581,6 +1581,8 @@ EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + u32 max_segs = 1; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1592,9 +1594,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } + sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 841de32f1fee..e64892769607 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -702,7 +702,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = &one, .extra2 = &gso_max_segs, }, { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 534e5fdb04c1..190538a2a88c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2088,7 +2088,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; - if (tso_segs == 1 || !max_segs) { + if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) @@ -2101,7 +2101,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } limit = mss_now; - if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp)) + if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, -- cgit From ffa915d071ce4a05dcd866409df26513d25786f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 May 2015 00:19:03 -0400 Subject: ipv4: Fix fib_trie.c build, missing linux/vmalloc.h include. We used to get this indirectly I supposed, but no longer do. Either way, an explicit include should have been done in the first place. net/ipv4/fib_trie.c: In function '__node_free_rcu': >> net/ipv4/fib_trie.c:293:3: error: implicit declaration of function 'vfree' [-Werror=implicit-function-declaration] vfree(n); ^ net/ipv4/fib_trie.c: In function 'tnode_alloc': >> net/ipv4/fib_trie.c:312:3: error: implicit declaration of function 'vzalloc' [-Werror=implicit-function-declaration] return vzalloc(size); ^ >> net/ipv4/fib_trie.c:312:3: warning: return makes pointer from integer without a cast cc1: some warnings being treated as errors Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5a5d9bdeaeb4..01bce1506cd7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include -- cgit From 0f999b09f5c1b135e840501840dbcd01fad66f79 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Wed, 27 May 2015 09:10:54 +0530 Subject: ieee802154: add set transmit power support This patch adds transmission power setting support for IEEE-802.15.4 devices via nl802154. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 30 ++++++++++++++++++++++++++++++ net/ieee802154/rdev-ops.h | 12 ++++++++++++ net/ieee802154/trace.h | 15 +++++++++++++++ net/mac802154/cfg.c | 19 +++++++++++++++++++ 4 files changed, 76 insertions(+) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 212a795b33c9..5c9d524ba02c 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -783,6 +783,28 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) return rdev_set_cca_mode(rdev, &cca); } +static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + s32 power; + int i; + + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)) + return -EOPNOTSUPP; + + if (!info->attrs[NL802154_ATTR_TX_POWER]) + return -EINVAL; + + power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]); + + for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) { + if (power == rdev->wpan_phy.supported.tx_powers[i]) + return rdev_set_tx_power(rdev, power); + } + + return -EINVAL; +} + static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; @@ -1093,6 +1115,14 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_TX_POWER, + .doit = nl802154_set_tx_power, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, { .cmd = NL802154_CMD_SET_PAN_ID, .doit = nl802154_set_pan_id, diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 7b5a9dd94fe5..36456118a6ec 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -74,6 +74,18 @@ rdev_set_cca_mode(struct cfg802154_registered_device *rdev, return ret; } +static inline int +rdev_set_tx_power(struct cfg802154_registered_device *rdev, + s32 power) +{ + int ret; + + trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power); + ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + static inline int rdev_set_pan_id(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 pan_id) diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 5ac25eb6ed17..1f1cecc420d7 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -93,6 +93,21 @@ TRACE_EVENT(802154_rdev_set_channel, __entry->page, __entry->channel) ); +TRACE_EVENT(802154_rdev_set_tx_power, + TP_PROTO(struct wpan_phy *wpan_phy, s32 power), + TP_ARGS(wpan_phy, power), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(s32, power) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->power = power; + ), + TP_printk(WPAN_PHY_PR_FMT ", power: %d", WPAN_PHY_PR_ARG, + __entry->power) +); + TRACE_EVENT(802154_rdev_set_cca_mode, TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca), TP_ARGS(wpan_phy, cca), diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index d5290ea49dca..ab12fa296eb3 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -105,6 +105,24 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, return ret; } +static int +ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + ASSERT_RTNL(); + + if (wpan_phy->transmit_power == power) + return 0; + + ret = drv_set_tx_power(local, power); + if (!ret) + wpan_phy->transmit_power = power; + + return ret; +} + static int ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 pan_id) @@ -195,6 +213,7 @@ const struct cfg802154_ops mac802154_config_ops = { .del_virtual_intf = ieee802154_del_iface, .set_channel = ieee802154_set_channel, .set_cca_mode = ieee802154_set_cca_mode, + .set_tx_power = ieee802154_set_tx_power, .set_pan_id = ieee802154_set_pan_id, .set_short_addr = ieee802154_set_short_addr, .set_backoff_exponent = ieee802154_set_backoff_exponent, -- cgit From dec169eccc26b12a533856a7fea87a245c47c11d Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Wed, 27 May 2015 14:21:06 +0530 Subject: ieee802154: fix typo for file name Signed-off-by: Varka Bhadram Signed-off-by: Marcel Holtmann --- net/ieee802154/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 1f1cecc420d7..ac790bb1b4fe 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -1,4 +1,4 @@ -/* Based on net/wireless/tracing.h */ +/* Based on net/wireless/trace.h */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg802154 -- cgit From c5501eb3406d0f88b3efb2c437c4c40b35f865d8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 May 2015 16:32:50 +0200 Subject: net: ipv4: avoid repeated calls to ip_skb_dst_mtu helper ip_skb_dst_mtu is small inline helper, but its called in several places. before: 17061 44 0 17105 42d1 net/ipv4/ip_output.o after: 16805 44 0 16849 41d1 net/ipv4/ip_output.o Signed-off-by: Florian Westphal Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 451b009dae75..d6dd8ba04441 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,6 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, int (*output)(struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ @@ -219,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, + unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -227,7 +229,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + skb_gso_network_seglen(skb) <= mtu) return ip_finish_output2(sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. @@ -251,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) int err; segs->next = NULL; - err = ip_fragment(sk, segs, ip_finish_output2); + err = ip_fragment(sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -263,6 +265,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) static int ip_finish_output(struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -270,11 +274,12 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) return dst_output_sk(sk, skb); } #endif + mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb); + return ip_finish_output_gso(sk, skb, mtu); - if (skb->len > ip_skb_dst_mtu(skb)) - return ip_fragment(sk, skb, ip_finish_output2); + if (skb->len > mtu) + return ip_fragment(sk, skb, mtu, ip_finish_output2); return ip_finish_output2(sk, skb); } @@ -482,10 +487,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) } static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, int (*output)(struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); - unsigned int mtu = ip_skb_dst_mtu(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && -- cgit From d6b915e29f4adea94bc02ba7675bb4f84e6a1abd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 May 2015 16:32:51 +0200 Subject: ip_fragment: don't forward defragmented DF packet We currently always send fragments without DF bit set. Thus, given following setup: mtu1500 - mtu1500:1400 - mtu1400:1280 - mtu1280 A R1 R2 B Where R1 and R2 run linux with netfilter defragmentation/conntrack enabled, then if Host A sent a fragmented packet _with_ DF set to B, R1 will respond with icmp too big error if one of these fragments exceeded 1400 bytes. However, if R1 receives fragment sizes 1200 and 100, it would forward the reassembled packet without refragmenting, i.e. R2 will send an icmp error in response to a packet that was never sent, citing mtu that the original sender never exceeded. The other minor issue is that a refragmentation on R1 will conceal the MTU of R2-B since refragmentation does not set DF bit on the fragments. This modifies ip_fragment so that we track largest fragment size seen both for DF and non-DF packets, and set frag_max_size to the largest value. If the DF fragment size is larger or equal to the non-df one, we will consider the packet a path mtu probe: We set DF bit on the reassembled skb and also tag it with a new IPCB flag to force refragmentation even if skb fits outdev mtu. We will also set DF bit on each fragment in this case. Joint work with Hannes Frederic Sowa. Reported-by: Jesse Gross Signed-off-by: Florian Westphal Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 31 ++++++++++++++++++++++++++----- net/ipv4/ip_output.c | 12 ++++++++++-- 2 files changed, 36 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 47fa64ee82b1..a50dc6d408d1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -75,6 +75,7 @@ struct ipq { __be16 id; u8 protocol; u8 ecn; /* RFC3168 support */ + u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; @@ -326,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct sk_buff *prev, *next; struct net_device *dev; + unsigned int fragsize; int flags, offset; int ihl, end; int err = -ENOENT; @@ -481,9 +483,14 @@ found: if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; + fragsize = skb->len + ihl; + + if (fragsize > qp->q.max_size) + qp->q.max_size = fragsize; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len + ihl > qp->q.max_size) - qp->q.max_size = skb->len + ihl; + fragsize > qp->max_df_size) + qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { @@ -613,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = qp->q.max_size; + IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); - /* max_size != 0 implies at least one fragment had IP_DF set */ - iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; + + /* When we set IP_DF on a refragmented skb we must also force a + * call to ip_fragment to avoid forwarding a DF-skb of size s while + * original sender only sent fragments of size f (where f < s). + * + * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest + * frag seen to avoid sending tiny DF-fragments in case skb was built + * from one very small df-fragment and one large non-df frag. + */ + if (qp->max_df_size == qp->q.max_size) { + IPCB(head)->flags |= IPSKB_FRAG_PMTU; + iph->frag_off = htons(IP_DF); + } else { + iph->frag_off = 0; + } + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d6dd8ba04441..f5f5ef1cebd5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -278,7 +278,7 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) if (skb_is_gso(skb)) return ip_finish_output_gso(sk, skb, mtu); - if (skb->len > mtu) + if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) return ip_fragment(sk, skb, mtu, ip_finish_output2); return ip_finish_output2(sk, skb); @@ -492,7 +492,10 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + if ((iph->frag_off & htons(IP_DF)) == 0) + return ip_do_fragment(sk, skb, output); + + if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { struct rtable *rt = skb_rtable(skb); @@ -537,6 +540,8 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); + if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) + mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. @@ -732,6 +737,9 @@ slow_path: iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); + if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) + iph->frag_off |= htons(IP_DF); + /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE -- cgit From e4390592a4fc3ee10a8bc4742197fdbac3935f74 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 27 May 2015 13:42:09 +0200 Subject: nl802154: add support for cca ed level info This patch adds information about the current cca ed level when the phy is dumped over nl802154. Signed-off-by: Alexander Aring Reviewed-by: Varka Bhadram Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 5c9d524ba02c..a89d9d024126 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -211,6 +211,7 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, }, + [NL802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, }, [NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, }, @@ -421,6 +422,12 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, goto nla_put_failure; } + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { + if (nla_put_s32(msg, NL802154_ATTR_CCA_ED_LEVEL, + rdev->wpan_phy.cca_ed_level)) + goto nla_put_failure; + } + if (nl802154_put_capabilities(msg, rdev)) goto nla_put_failure; -- cgit From b69644c1c72e179738dd5c7e52e99d8550189472 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 27 May 2015 13:42:10 +0200 Subject: nl802154: add support to set cca ed level This patch adds support for setting the current cca ed level value over nl802154. Signed-off-by: Alexander Aring Reviewed-by: Varka Bhadram Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 30 ++++++++++++++++++++++++++++++ net/ieee802154/rdev-ops.h | 11 +++++++++++ net/ieee802154/trace.h | 15 +++++++++++++++ net/mac802154/cfg.c | 19 +++++++++++++++++++ 4 files changed, 75 insertions(+) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index a89d9d024126..7dbb1f4ce7df 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -790,6 +790,28 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) return rdev_set_cca_mode(rdev, &cca); } +static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + s32 ed_level; + int i; + + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL)) + return -EOPNOTSUPP; + + if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL]) + return -EINVAL; + + ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]); + + for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) { + if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i]) + return rdev_set_cca_ed_level(rdev, ed_level); + } + + return -EINVAL; +} + static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; @@ -1122,6 +1144,14 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_CCA_ED_LEVEL, + .doit = nl802154_set_cca_ed_level, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, { .cmd = NL802154_CMD_SET_TX_POWER, .doit = nl802154_set_tx_power, diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 36456118a6ec..b2155a123f6c 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -74,6 +74,17 @@ rdev_set_cca_mode(struct cfg802154_registered_device *rdev, return ret; } +static inline int +rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level) +{ + int ret; + + trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level); + ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + static inline int rdev_set_tx_power(struct cfg802154_registered_device *rdev, s32 power) diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index ac790bb1b4fe..73eb7605c1eb 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -123,6 +123,21 @@ TRACE_EVENT(802154_rdev_set_cca_mode, WPAN_CCA_PR_ARG) ); +TRACE_EVENT(802154_rdev_set_cca_ed_level, + TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level), + TP_ARGS(wpan_phy, ed_level), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(s32, ed_level) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->ed_level = ed_level; + ), + TP_printk(WPAN_PHY_PR_FMT ", ed_level: %d", WPAN_PHY_PR_ARG, + __entry->ed_level) +); + DECLARE_EVENT_CLASS(802154_le16_template, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 le16arg), diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index ab12fa296eb3..317c4662e544 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -105,6 +105,24 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, return ret; } +static int +ieee802154_set_cca_ed_level(struct wpan_phy *wpan_phy, s32 ed_level) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + ASSERT_RTNL(); + + if (wpan_phy->cca_ed_level == ed_level) + return 0; + + ret = drv_set_cca_ed_level(local, ed_level); + if (!ret) + wpan_phy->cca_ed_level = ed_level; + + return ret; +} + static int ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power) { @@ -213,6 +231,7 @@ const struct cfg802154_ops mac802154_config_ops = { .del_virtual_intf = ieee802154_del_iface, .set_channel = ieee802154_set_channel, .set_cca_mode = ieee802154_set_cca_mode, + .set_cca_ed_level = ieee802154_set_cca_ed_level, .set_tx_power = ieee802154_set_tx_power, .set_pan_id = ieee802154_set_pan_id, .set_short_addr = ieee802154_set_short_addr, -- cgit From 07f4c90062f8fc7c8c26f8f95324cbe8fa3145a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 May 2015 14:49:35 -0700 Subject: tcp/dccp: try to not exhaust ip_local_port_range in connect() A long standing problem on busy servers is the tiny available TCP port range (/proc/sys/net/ipv4/ip_local_port_range) and the default sequential allocation of source ports in connect() system call. If a host is having a lot of active TCP sessions, chances are very high that all ports are in use by at least one flow, and subsequent bind(0) attempts fail, or have to scan a big portion of space to find a slot. In this patch, I changed the starting point in __inet_hash_connect() so that we try to favor even [1] ports, leaving odd ports for bind() users. We still perform a sequential search, so there is no guarantee, but if connect() targets are very different, end result is we leave more ports available to bind(), and we spread them all over the range, lowering time for both connect() and bind() to find a slot. This strategy only works well if /proc/sys/net/ipv4/ip_local_port_range is even, ie if start/end values have different parity. Therefore, default /proc/sys/net/ipv4/ip_local_port_range was changed to 32768 - 60999 (instead of 32768 - 61000) There is no change on security aspects here, only some poor hashing schemes could be eventually impacted by this change. [1] : The odd/even property depends on ip_local_port_range values parity Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/inet_hashtables.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 235d36afece3..6ad0f7a711c9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1595,7 +1595,7 @@ static __net_init int inet_init_net(struct net *net) */ seqlock_init(&net->ipv4.ip_local_ports.lock); net->ipv4.ip_local_ports.range[0] = 32768; - net->ipv4.ip_local_ports.range[1] = 61000; + net->ipv4.ip_local_ports.range[1] = 60999; seqlock_init(&net->ipv4.ping_group_range.lock); /* diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 185efef0f125..1b336dc179f8 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -502,8 +502,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; + /* By starting with offset being an even number, + * we tend to leave about 50% of ports for other uses, + * like bind(0). + */ + offset &= ~1; + local_bh_disable(); - for (i = 1; i <= remaining; i++) { + for (i = 0; i < remaining; i++) { port = low + (i + offset) % remaining; if (inet_is_local_reserved_port(net, port)) continue; @@ -547,7 +553,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return -EADDRNOTAVAIL; ok: - hint += i; + hint += (i + 2) & ~1; /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); -- cgit From e2baad9e4b153c67dddc5ccf987395b842329c84 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2015 10:46:02 -0700 Subject: tcp: connect() from bound sockets can be faster __inet_hash_connect() does not use its third argument (port_offset) if socket was already bound to a source port. No need to perform useless but expensive md5 computations. Reported-by: Crestez Dan Leonard Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 9 +++++++-- net/ipv6/inet6_hashtables.c | 8 ++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1b336dc179f8..5f9b063bbe8a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -394,9 +394,10 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet_sk_port_offset(const struct sock *sk) +static u32 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); @@ -600,7 +601,11 @@ out: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 871641bc1ed4..b4fd96de97e6 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -257,7 +257,7 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet6_sk_port_offset(const struct sock *sk) +static u32 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); @@ -269,7 +269,11 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet6_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); -- cgit From ed2dfd900992aa7b6b3d0abd8ec9a7e9d2c7f827 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2015 11:34:37 -0700 Subject: tcp/dccp: warn user for preferred ip_local_port_range After commit 07f4c90062f8f ("tcp/dccp: try to not exhaust ip_local_port_range in connect()") it is advised to have an even number of ports described in /proc/sys/net/ipv4/ip_local_port_range This means start/end values should have a different parity. Let's warn sysadmins of this, so that they can update their settings if they want to. Suggested-by: David S. Miller Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e64892769607..0330ab2e2b63 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,7 +45,13 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { + bool same_parity = !((range[0] ^ range[1]) & 1); + write_seqlock(&net->ipv4.ip_local_ports.lock); + if (same_parity && !net->ipv4.ip_local_ports.warned) { + net->ipv4.ip_local_ports.warned = true; + pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); + } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; write_sequnlock(&net->ipv4.ip_local_ports.lock); -- cgit From 70e717762d2fa80e995303995c80e0abc6419997 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Tue, 28 Apr 2015 20:29:53 +0200 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 4d2318829a34..9d5657e14765 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2015.0" +#define BATADV_SOURCE_VERSION "2015.1" #endif /* B.A.T.M.A.N. parameters */ -- cgit From 9f6446c7f9af084763037334d37e85dacfcbd403 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 23 Apr 2015 13:16:35 +0200 Subject: batman-adv: update copyright years for 2015 Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.h | 2 +- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bitarray.c | 2 +- net/batman-adv/bitarray.h | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/bridge_loop_avoidance.h | 2 +- net/batman-adv/debugfs.c | 2 +- net/batman-adv/debugfs.h | 2 +- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/fragmentation.h | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/gateway_common.h | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hard-interface.h | 2 +- net/batman-adv/hash.c | 2 +- net/batman-adv/hash.h | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/icmp_socket.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/multicast.h | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/network-coding.h | 2 +- net/batman-adv/originator.c | 2 +- net/batman-adv/originator.h | 2 +- net/batman-adv/packet.h | 2 +- net/batman-adv/routing.c | 2 +- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 2 +- net/batman-adv/send.h | 2 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/soft-interface.h | 2 +- net/batman-adv/sysfs.c | 2 +- net/batman-adv/sysfs.h | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/translation-table.h | 2 +- net/batman-adv/types.h | 2 +- 43 files changed, 43 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index eb7d8c0388e4..79833ebcc2dd 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 4e49666f8c65..4e59cf3eb079 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 00e00e09b000..fc299c03a438 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index e3da07a64026..40e4a2a18e45 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 2acaafe60188..be497be696d1 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ac4b96eccade..fa941cd7d8ad 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 43c985d92c3e..1f506d34039e 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index a4972874c056..0c213a892296 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 37c4d6ddd04d..f3b49c394446 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index aad022dd15df..da1742d9059f 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 2fe0764c64be..ed41b8edba18 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 3d1dcaa3e8b5..af495cc861fe 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index d848cf6676a2..ec1e86f899e8 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 090828cf1fa7..a85eaca344e8 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 7ee53bb7d50f..185fb0887654 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 88a1bc3804d1..0792e2f101e4 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index aa5116561947..df5434229675 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index baf1f9843f2c..bdb020e29272 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 1918cd50b62e..e8b6ffea703d 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 7c1c63080e20..3a0e1dcd1f29 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 539fc1266793..379e32acf2b4 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 161ef8f17d2e..6c3cfb57d132 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 0c33950aa4aa..4815824e2f61 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 12fc77bef23f..ca63d487c490 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 9d5657e14765..11d2d9fbfc5e 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index b24e4bb64fb5..09f2838dedf2 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3a44ebdb43cb..033d80e84fdf 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 127cc4d7380a..89e1d4772cff 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 358c0d686ab0..b5ab8ff544ee 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 90e805aba379..e3900e452616 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index aa4a43696295..91339143a2f7 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index b81fbbf21a63..9468bc09c7c4 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index da83982bf974..c5d90095bc3c 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 557d3d12a9ab..6573f12b3ddc 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 3d64ed20c393..fa70ae8c9fe8 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 38d0ec1833ae..60c233eb35ed 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5ec31d7de24f..d85a45c24f62 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index dbab22fd89a5..9ce08049ffd0 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index a75dc12f96f8..fa8c347bf057 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index b715b60db7cd..b9e79ad806ac 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 07b263a437d1..b098e53edded 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index ad84d7b89e39..5769037c7e2d 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 9398c3fb4174..28f2461ea630 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * -- cgit From 53e771457e823fbc21834f60508c42a4270534fd Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 1 Dec 2014 10:37:27 +0100 Subject: batman-adv: Check total_size when queueing fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fragmentation code was replaced in 610bfc6bc99bc83680d190ebc69359a05fc7f605 ("batman-adv: Receive fragmented packets and merge") by an implementation which handles the queueing+merging of fragments based on their size and the total_size of the non-fragmented packet. This total_size is announced by each fragment. The new implementation doesn't check if the the total_size information of the packets inside one chain is consistent. This is consistency check is recommended to allow using any of the packets in the queue to decide whether all fragments of a packet are received or not. Signed-off-by: Sven Eckelmann Acked-by: Martin Hundebøll Signed-off-by: Marek Lindner --- net/batman-adv/fragmentation.c | 7 +++++-- net/batman-adv/types.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index af495cc861fe..a9fc6537214b 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -161,6 +161,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, hlist_add_head(&frag_entry_new->list, &chain->head); chain->size = skb->len - hdr_size; chain->timestamp = jiffies; + chain->total_size = ntohs(frag_packet->total_size); ret = true; goto out; } @@ -195,9 +196,11 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, out: if (chain->size > batadv_frag_size_limit() || - ntohs(frag_packet->total_size) > batadv_frag_size_limit()) { + chain->total_size != ntohs(frag_packet->total_size) || + chain->total_size > batadv_frag_size_limit()) { /* Clear chain if total size of either the list or the packet - * exceeds the maximum size of one merged packet. + * exceeds the maximum size of one merged packet. Don't allow + * packets to have different total_size. */ batadv_frag_clear_chain(&chain->head); chain->size = 0; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 28f2461ea630..e95db4273356 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -132,6 +132,7 @@ struct batadv_orig_ifinfo { * @timestamp: time (jiffie) of last received fragment * @seqno: sequence number of the fragments in the list * @size: accumulated size of packets in list + * @total_size: expected size of the assembled packet */ struct batadv_frag_table_entry { struct hlist_head head; @@ -139,6 +140,7 @@ struct batadv_frag_table_entry { unsigned long timestamp; uint16_t seqno; uint16_t size; + uint16_t total_size; }; /** -- cgit From 83e8b87721f21b26b843633caca8ef453e943623 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 1 Dec 2014 10:37:28 +0100 Subject: batman-adv: Use only queued fragments when merging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fragment queueing code now validates the total_size of each fragment, checks when enough fragments are queued to allow to merge them into a single packet and if the fragments have the correct size. Therefore, it is not required to have any other parameter for the merging function than a list of queued fragments. This change should avoid problems like in the past when the different skb from the list and the function parameter were mixed incorrectly. Signed-off-by: Sven Eckelmann Acked-by: Martin Hundebøll Signed-off-by: Marek Lindner --- net/batman-adv/fragmentation.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a9fc6537214b..6ce3c84a7e55 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -231,19 +231,13 @@ err: * Returns the merged skb or NULL on error. */ static struct sk_buff * -batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb) +batadv_frag_merge_packets(struct hlist_head *chain) { struct batadv_frag_packet *packet; struct batadv_frag_list_entry *entry; struct sk_buff *skb_out = NULL; int size, hdr_size = sizeof(struct batadv_frag_packet); - /* Make sure incoming skb has non-bogus data. */ - packet = (struct batadv_frag_packet *)skb->data; - size = ntohs(packet->total_size); - if (size > batadv_frag_size_limit()) - goto free; - /* Remove first entry, as this is the destination for the rest of the * fragments. */ @@ -252,6 +246,9 @@ batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb) skb_out = entry->skb; kfree(entry); + packet = (struct batadv_frag_packet *)skb_out->data; + size = ntohs(packet->total_size); + /* Make room for the rest of the fragments. */ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { kfree_skb(skb_out); @@ -307,7 +304,7 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, if (hlist_empty(&head)) goto out; - skb_out = batadv_frag_merge_packets(&head, *skb); + skb_out = batadv_frag_merge_packets(&head); if (!skb_out) goto out_err; -- cgit From 9bb218828c8f4fa6587af93e248903c96ce469d0 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:18 +0100 Subject: batman-adv: debugfs, avoid compiling for !DEBUG_FS Normally the debugfs framework will return error pointer with -ENODEV for function calls when DEBUG_FS is not set. batman does not notice this error code and continues trying to create debugfs files and executes more code. We can avoid this code execution by disabling compiling debugfs.c when DEBUG_FS is not set. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/Makefile | 2 +- net/batman-adv/debugfs.c | 8 -------- net/batman-adv/debugfs.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 79833ebcc2dd..bd7e343c98d1 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv.o batman-adv-y += bat_iv_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o -batman-adv-y += debugfs.o +batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o batman-adv-y += fragmentation.o batman-adv-y += gateway_client.o diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 0c213a892296..46118084221a 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -482,11 +482,7 @@ rem_attr: debugfs_remove_recursive(hard_iface->debug_dir); hard_iface->debug_dir = NULL; out: -#ifdef CONFIG_DEBUG_FS return -ENOMEM; -#else - return 0; -#endif /* CONFIG_DEBUG_FS */ } /** @@ -541,11 +537,7 @@ rem_attr: debugfs_remove_recursive(bat_priv->debug_dir); bat_priv->debug_dir = NULL; out: -#ifdef CONFIG_DEBUG_FS return -ENOMEM; -#else - return 0; -#endif /* CONFIG_DEBUG_FS */ } void batadv_debugfs_del_meshif(struct net_device *dev) diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index f3b49c394446..ed25605ca732 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -20,6 +20,8 @@ #define BATADV_DEBUGFS_SUBDIR "batman_adv" +#if IS_ENABLED(CONFIG_DEBUG_FS) + void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); @@ -27,4 +29,36 @@ void batadv_debugfs_del_meshif(struct net_device *dev); int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); +#else + +static inline void batadv_debugfs_init(void) +{ +} + +static inline void batadv_debugfs_destroy(void) +{ +} + +static inline int batadv_debugfs_add_meshif(struct net_device *dev) +{ + return 0; +} + +static inline void batadv_debugfs_del_meshif(struct net_device *dev) +{ +} + +static inline +int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +{ + return 0; +} + +static inline +void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) +{ +} + +#endif + #endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */ -- cgit From 16b9ce83fb850602794a3bc534693362408a5408 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:23 +0100 Subject: batman-adv: tvlv realloc, move error handling into if block Instead of hiding the normal function flow inside an if block, we should just put the error handling into the if block. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/main.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index ca63d487c490..fd9333dffc97 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -819,15 +819,15 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); /* keep old buffer if kmalloc should fail */ - if (new_buff) { - memcpy(new_buff, *packet_buff, min_packet_len); - kfree(*packet_buff); - *packet_buff = new_buff; - *packet_buff_len = min_packet_len + additional_packet_len; - return true; - } + if (!new_buff) + return false; + + memcpy(new_buff, *packet_buff, min_packet_len); + kfree(*packet_buff); + *packet_buff = new_buff; + *packet_buff_len = min_packet_len + additional_packet_len; - return false; + return true; } /** -- cgit From 9fc1883ef25a43b895531ac44a38257ba8c2ca62 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:25 +0100 Subject: batman-adv: Makefile, Sort alphabetically The whole Makefile is sorted, just the multicast rule is not at the right position. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index bd7e343c98d1..21434ab79d2c 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -29,6 +29,7 @@ batman-adv-y += hard-interface.o batman-adv-y += hash.o batman-adv-y += icmp_socket.o batman-adv-y += main.o +batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o @@ -36,4 +37,3 @@ batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o batman-adv-y += translation-table.o -batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o -- cgit From 42d9f2cbd42e1ba137584da8305cf6f68b84f39d Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:26 +0100 Subject: batman-adv: iv_ogm_iface_enable, direct return values Directly return error values. No need to use a return variable. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index fc299c03a438..123aabbcb003 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -308,7 +308,6 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; uint32_t random_seqno; - int res = -ENOMEM; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); @@ -317,7 +316,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) - goto out; + return -ENOMEM; hard_iface->bat_iv.ogm_buff = ogm_buff; @@ -329,10 +328,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; - res = 0; - -out: - return res; + return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) -- cgit From 9fd9b19ea03cce2e47544a3b8e0b532485b34758 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:27 +0100 Subject: batman-adv: iv_ogm_aggr_packet, bool return value This function returns bool values, so it should be defined to return them instead of the whole int range. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 123aabbcb003..00151385ad11 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -392,8 +392,8 @@ static uint8_t batadv_hop_penalty(uint8_t tq, } /* is there another aggregated packet here? */ -static int batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, - __be16 tvlv_len) +static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, + __be16 tvlv_len) { int next_buff_pos = 0; -- cgit From de12baece9b8010d036b4104a9f08d21418b60e8 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:28 +0100 Subject: batman-adv: iv_ogm_send_to_if, declare char* as const This string pointer is later assigned to a constant string, so it should be defined constant at the beginning. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 00151385ad11..f9c561094942 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -409,7 +409,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - char *fwd_str; + const char *fwd_str; uint8_t packet_num; int16_t buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; -- cgit From dab7b62190c5abbe90eef4e9f9e7c28492e77eba Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 18 Feb 2015 18:20:24 +0100 Subject: batman-adv: Use safer default config for optional features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current default settings for optional features in batman-adv seems to be based around the idea that the user only compiles what he requires. They will automatically enabled when they are compiled in. For example the network coding part of batman-adv is by default disabled in the out-of-tree module but will be enabled when the code is compiled during the module build. But distributions like Debian just enable all features of the batman-adv kernel module and hope that more experimental features or features with possible negative effects have to be enabled using some runtime configuration interface. The network_coding feature can help in specific setups but also has drawbacks and is not disabled by default in the out-of-tree module. Disabling by default in the runtime config seems to be also quite sane. The bridge_loop_avoidance is the only feature which is disabled by default but may be necessary even in simple setups. Packet loops may even be created during the initial node setup when this is not enabled. This is different than STP on bridges because mesh is usually used on Adhoc WiFi. Having two nodes (by accident) in the same LAN segment and in the same mesh network is rather common in this situation. Signed-off-by: Sven Eckelmann Acked-by: Martin Hundebøll Signed-off-by: Marek Lindner --- net/batman-adv/network-coding.c | 2 +- net/batman-adv/soft-interface.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 89e1d4772cff..4cb70bb4c3e8 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -155,7 +155,7 @@ err: */ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) { - atomic_set(&bat_priv->network_coding, 1); + atomic_set(&bat_priv->network_coding, 0); bat_priv->nc.min_tq = 200; bat_priv->nc.max_fwd_delay = 10; bat_priv->nc.max_buffer_time = 200; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index d85a45c24f62..9426b83caab6 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -732,7 +732,7 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA - atomic_set(&bat_priv->bridge_loop_avoidance, 0); + atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); -- cgit From 00f548bf5494ade996ae9c5e85c497dd2a3fdad5 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Wed, 18 Feb 2015 22:17:30 +0800 Subject: batman-adv: checkpatch - comparison to NULL could be rewritten Signed-off-by: Marek Lindner --- net/batman-adv/soft-interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 9426b83caab6..50cf722f4e1b 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -818,7 +818,7 @@ static int batadv_softif_slave_add(struct net_device *dev, int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); - if (!hard_iface || hard_iface->soft_iface != NULL) + if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev->name); -- cgit From fc1f86936651191ca51e532083206efb351ccd9c Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Wed, 18 Feb 2015 22:19:20 +0800 Subject: batman-adv: checkpatch - spaces preferred around that '*' Signed-off-by: Marek Lindner --- net/batman-adv/main.h | 2 +- net/batman-adv/network-coding.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 11d2d9fbfc5e..026ba37f31a6 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -44,7 +44,7 @@ #define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ #define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ -#define BATADV_DAT_ENTRY_TIMEOUT (5*60000) /* 5 mins in milliseconds */ +#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */ /* sliding packet range of received originator messages in sequence numbers * (should be a multiple of our word size) */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 4cb70bb4c3e8..b984bc49deaf 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -275,7 +275,7 @@ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, * max_buffer time */ return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_buffer_time*10); + bat_priv->nc.max_buffer_time * 10); } /** -- cgit From 8f34b388786a952382c829453f280384f1bf3886 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:29 +0100 Subject: batman-adv: iv_ogm_can_aggregate, code readability This patch tries to increase code readability by negating the first if block and rearranging some of the other conditional blocks. This way we save an indentation level, we also save some allocation that is not necessary for one of the conditions. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 102 +++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f9c561094942..ce04cd75fae4 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -544,58 +544,62 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet wont be bigger than * MAX_AGGREGATION_BYTES + * otherwise aggregation is not possible */ - if (time_before(send_time, forw_packet->send_time) && - time_after_eq(aggregation_end_time, forw_packet->send_time) && - (aggregated_bytes <= BATADV_MAX_AGGREGATION_BYTES)) { - /* check aggregation compatibility - * -> direct link packets are broadcasted on - * their interface only - * -> aggregate packet if the current packet is - * a "global" packet as well as the base - * packet - */ - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; - - /* packet is not leaving on the same interface. */ - if (forw_packet->if_outgoing != if_outgoing) - goto out; + if (!time_before(send_time, forw_packet->send_time) || + !time_after_eq(aggregation_end_time, forw_packet->send_time)) + return false; + + if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES) + return false; + + /* packet is not leaving on the same interface. */ + if (forw_packet->if_outgoing != if_outgoing) + return false; + + /* check aggregation compatibility + * -> direct link packets are broadcasted on + * their interface only + * -> aggregate packet if the current packet is + * a "global" packet as well as the base + * packet + */ + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return false; - /* packets without direct link flag and high TTL - * are flooded through the net - */ - if ((!directlink) && - (!(batadv_ogm_packet->flags & BATADV_DIRECTLINK)) && - (batadv_ogm_packet->ttl != 1) && - - /* own packets originating non-primary - * interfaces leave only that interface - */ - ((!forw_packet->own) || - (forw_packet->if_incoming == primary_if))) { - res = true; - goto out; - } + /* packets without direct link flag and high TTL + * are flooded through the net + */ + if (!directlink && + !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) && + batadv_ogm_packet->ttl != 1 && + + /* own packets originating non-primary + * interfaces leave only that interface + */ + (!forw_packet->own || + forw_packet->if_incoming == primary_if)) { + res = true; + goto out; + } - /* if the incoming packet is sent via this one - * interface only - we still can aggregate - */ - if ((directlink) && - (new_bat_ogm_packet->ttl == 1) && - (forw_packet->if_incoming == if_incoming) && - - /* packets from direct neighbors or - * own secondary interface packets - * (= secondary interface packets in general) - */ - (batadv_ogm_packet->flags & BATADV_DIRECTLINK || - (forw_packet->own && - forw_packet->if_incoming != primary_if))) { - res = true; - goto out; - } + /* if the incoming packet is sent via this one + * interface only - we still can aggregate + */ + if (directlink && + new_bat_ogm_packet->ttl == 1 && + forw_packet->if_incoming == if_incoming && + + /* packets from direct neighbors or + * own secondary interface packets + * (= secondary interface packets in general) + */ + (batadv_ogm_packet->flags & BATADV_DIRECTLINK || + (forw_packet->own && + forw_packet->if_incoming != primary_if))) { + res = true; + goto out; } out: -- cgit From 01b97a3eed3fe40e3a007d56935b421023bca2e3 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:30 +0100 Subject: batman-adv: iv_ogm_orig_update, remove unnecessary brackets Remove these unnecessary brackets inside a condition. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index ce04cd75fae4..c5ba7a798de7 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1081,7 +1081,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, * won't consider it either */ if (router_ifinfo && - (neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg)) { + neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { orig_node_tmp = router->orig_node; spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); if_num = router->if_incoming->if_num; -- cgit From 8ea64e27080eb66dc26f64f28485c0bc9fc06b36 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 11 May 2015 20:34:52 +0200 Subject: batman-adv: Use common declaration order in *_send_skb_(packet|unicast) Signed-off-by: Antonio Quartulli --- net/batman-adv/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index fa70ae8c9fe8..23635bd63fec 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -255,8 +255,8 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { - struct ethhdr *ethhdr; struct batadv_unicast_packet *unicast_packet; + struct ethhdr *ethhdr; int ret = NET_XMIT_DROP; if (!orig_node) -- cgit From ab499db80fcf07c18e4053f91a619500f663e90e Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Fri, 22 May 2015 10:22:40 +0200 Subject: mac80211: prevent possible crypto tx tailroom corruption There was a possible race between ieee80211_reconfig() and ieee80211_delayed_tailroom_dec(). This could result in inability to transmit data if driver crashed during roaming or rekeying and subsequent skbs with insufficient tailroom appeared. This race was probably never seen in the wild because a device driver would have to crash AND recover within 0.5s which is very unlikely. I was able to prove this race exists after changing the delay to 10s locally and crashing ath10k via debugfs immediately after GTK rekeying. In case of ath10k the counter went below 0. This was harmless but other drivers which actually require tailroom (e.g. for WEP ICV or MMIC) could end up with the counter at 0 instead of >0 and introduce insufficient skb tailroom failures because mac80211 would not resize skbs appropriately anymore. Fixes: 8d1f7ecd2af5 ("mac80211: defer tailroom counter manipulation when roaming") Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 99d27babd9f0..674164fe5cdb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -246,6 +246,7 @@ static void ieee80211_restart_work(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, restart_work); + struct ieee80211_sub_if_data *sdata; /* wait for scan work complete */ flush_workqueue(local->workqueue); @@ -254,6 +255,8 @@ static void ieee80211_restart_work(struct work_struct *work) "%s called with hardware scan in progress\n", __func__); rtnl_lock(); + list_for_each_entry(sdata, &local->interfaces, list) + flush_delayed_work(&sdata->dec_tailroom_needed_wk); ieee80211_scan_cancel(local); ieee80211_reconfig(local); rtnl_unlock(); -- cgit From 6cbfb1bb66e4e85da5db78e8ff429a85bd84ce64 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Fri, 22 May 2015 10:57:22 +0200 Subject: cfg80211: ignore netif running state when changing iftype It was possible for mac80211 to be coerced into an unexpected flow causing sdata union to become corrupted. Station pointer was put into sdata->u.vlan.sta memory location while it was really master AP's sdata->u.ap.next_beacon. This led to station entry being later freed as next_beacon before __sta_info_flush() in ieee80211_stop_ap() and a subsequent invalid pointer dereference crash. The problem was that ieee80211_ptr->use_4addr wasn't cleared on interface type changes. This could be reproduced with the following steps: # host A and host B have just booted; no # wpa_s/hostapd running; all vifs are down host A> iw wlan0 set type station host A> iw wlan0 set 4addr on host A> printf 'interface=wlan0\nssid=4addrcrash\nchannel=1\nwds_sta=1' > /tmp/hconf host A> hostapd -B /tmp/conf host B> iw wlan0 set 4addr on host B> ifconfig wlan0 up host B> iw wlan0 connect -w hostAssid host A> pkill hostapd # host A crashed: [ 127.928192] BUG: unable to handle kernel NULL pointer dereference at 00000000000006c8 [ 127.929014] IP: [] __sta_info_flush+0xac/0x158 ... [ 127.934578] [] ieee80211_stop_ap+0x139/0x26c [ 127.934578] [] ? dump_trace+0x279/0x28a [ 127.934578] [] __cfg80211_stop_ap+0x84/0x191 [ 127.934578] [] cfg80211_stop_ap+0x3f/0x58 [ 127.934578] [] nl80211_stop_ap+0x1b/0x1d [ 127.934578] [] genl_family_rcv_msg+0x259/0x2b5 Note: This isn't a revert of f8cdddb8d61d ("cfg80211: check iface combinations only when iface is running") as far as functionality is considered because b6a550156bc ("cfg80211/mac80211: move more combination checks to mac80211") moved the logic somewhere else already. Fixes: f8cdddb8d61d ("cfg80211: check iface combinations only when iface is running") Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 4cb34557b873..baf7218cec15 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -945,7 +945,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; - if (ntype != otype && netif_running(dev)) { + if (ntype != otype) { dev->ieee80211_ptr->use_4addr = false; dev->ieee80211_ptr->mesh_id_up_len = 0; wdev_lock(dev->ieee80211_ptr); -- cgit From f7959e9c73200f2ae361d0d311aa501f2c6a05c7 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 28 May 2015 11:46:12 +0200 Subject: net: rfkill: gpio: make better use of gpiod API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 39b2bbe3d715 (gpio: add flags argument to gpiod_get*() functions) which appeared in v3.17-rc1, the gpiod_get* functions take an additional parameter that allows to specify direction and initial value for output. Furthermore there is devm_gpiod_get_optional which is designed to get optional gpios. Simplify driver accordingly. Note this makes error checking more strict because only -ENOENT is ignored when searching for the GPIOs which is good. Signed-off-by: Uwe Kleine-König Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index d978f2f46ff3..d5d58d919552 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -112,21 +112,17 @@ static int rfkill_gpio_probe(struct platform_device *pdev) rfkill->clk = devm_clk_get(&pdev->dev, NULL); - gpio = devm_gpiod_get(&pdev->dev, "reset"); - if (!IS_ERR(gpio)) { - ret = gpiod_direction_output(gpio, 0); - if (ret) - return ret; - rfkill->reset_gpio = gpio; - } + gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_OUT_LOW); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); - gpio = devm_gpiod_get(&pdev->dev, "shutdown"); - if (!IS_ERR(gpio)) { - ret = gpiod_direction_output(gpio, 0); - if (ret) - return ret; - rfkill->shutdown_gpio = gpio; - } + rfkill->reset_gpio = gpio; + + gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_OUT_LOW); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); + + rfkill->shutdown_gpio = gpio; /* Make sure at-least one of the GPIO is defined and that * a name is specified for this instance -- cgit From 8133534c760d4083f79d2cde42c636ccc0b2792e Mon Sep 17 00:00:00 2001 From: Sorin Dumitru Date: Wed, 27 May 2015 22:16:49 +0300 Subject: net: limit tcp/udp rmem/wmem to SOCK_{RCV,SND}BUF_MIN This is similar to b1cb59cf2efe(net: sysctl_net_core: check SNDBUF and RCVBUF for min length). I don't think too small values can cause crashes in the case of udp and tcp, but I've seen this set to too small values which triggered awful performance. It also makes the setting consistent across all the wmem/rmem sysctls. Signed-off-by: Sorin Dumitru Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0330ab2e2b63..433231ccfb17 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -41,6 +41,8 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) @@ -528,7 +530,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "tcp_notsent_lowat", @@ -543,7 +545,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "tcp_app_win", @@ -756,7 +758,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_rcvbuf, }, { .procname = "udp_wmem_min", @@ -764,7 +766,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_sndbuf, }, { } }; -- cgit From 37e82c2f974b72c9ab49c787ef7b5bb1aec12768 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 May 2015 15:30:39 -0700 Subject: bpf: allow BPF programs access skb->skb_iif and skb->dev->ifindex fields classic BPF already exposes skb->dev->ifindex via SKF_AD_IFINDEX extension. Allow eBPF program to access it as well. Note that classic aborts execution of the program if 'skb->dev == NULL' (which is inconvenient for program writers), whereas eBPF returns zero in such case. Also expose the 'skb_iif' field, since programs triggered by redirected packet need to known the original interface index. Summary: __skb->ifindex -> skb->dev->ifindex __skb->ingress_ifindex -> skb->skb_iif Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 3adcca6f17a4..2c30d6632d66 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1499,6 +1499,24 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, offsetof(struct sk_buff, priority)); break; + case offsetof(struct __sk_buff, ingress_ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, skb_iif)); + break; + + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + case offsetof(struct __sk_buff, mark): return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); -- cgit From 1ea23a21176e449685a9d0523ab6da83e3779eb1 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 28 May 2015 13:19:22 +0800 Subject: tipc: unconditionally put sock refcnt when sock timer to be deleted is pending As sock refcnt is taken when sock timer is started in sk_reset_timer(), the sock refcnt should be put when sock timer to be deleted is in pending state no matter what "probing_state" value of tipc sock is. Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/socket.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9370f953e16f..30ea82a9b0f1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -410,7 +410,7 @@ static int tipc_release(struct socket *sock) struct net *net; struct tipc_sock *tsk; struct sk_buff *skb; - u32 dnode, probing_state; + u32 dnode; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -448,10 +448,7 @@ static int tipc_release(struct socket *sock) } tipc_sk_withdraw(tsk, 0, NULL); - probing_state = tsk->probing_state; - if (del_timer_sync(&sk->sk_timer) && - probing_state != TIPC_CONN_PROBING) - sock_put(sk); + sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); if (tsk->connected) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, -- cgit From 3d2f6d41d1588c975d16c5969726d018bba90794 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 28 May 2015 23:02:17 +0200 Subject: ipv6: drop unneeded goto Delete jump to a label on the next line, when that label is not used elsewhere. A simplified version of the semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @r@ identifier l; @@ -if (...) goto l; -l: // Also remove the unnecessary ret variable. Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/raw.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 484a5c144073..ca4700cb26c4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1327,13 +1327,7 @@ static struct inet_protosw rawv6_protosw = { int __init rawv6_init(void) { - int ret; - - ret = inet6_register_protosw(&rawv6_protosw); - if (ret) - goto out; -out: - return ret; + return inet6_register_protosw(&rawv6_protosw); } void rawv6_exit(void) -- cgit From 282c320d33541d61f4ab91248448040c4ebab1fc Mon Sep 17 00:00:00 2001 From: Wang Long Date: Fri, 29 May 2015 01:02:08 +0000 Subject: netevent: remove automatic variable in register_netevent_notifier() Remove automatic variable 'err' in register_netevent_notifier() and return the result of atomic_notifier_chain_register() directly. Signed-off-by: Wang Long Signed-off-by: David S. Miller --- net/core/netevent.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/netevent.c b/net/core/netevent.c index f17ccd291d39..8b3bc4fac613 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -31,10 +31,7 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); */ int register_netevent_notifier(struct notifier_block *nb) { - int err; - - err = atomic_notifier_chain_register(&netevent_notif_chain, nb); - return err; + return atomic_notifier_chain_register(&netevent_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_netevent_notifier); -- cgit From daf4e2c89254ed6eb8cf7ef60f614edebfdb9f3a Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 28 May 2015 15:38:43 +0300 Subject: ieee802154: Fix EUI-64 station address validation. Refuse to allow setting an EUI-64 group address as an interface address, as those are not valid station addresses. Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index b544b5dc4bfb..6ac023932ce0 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -126,7 +126,7 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) return -EBUSY; ieee802154_be64_to_le64(&extended_addr, addr->sa_data); - if (!ieee802154_is_valid_extended_addr(extended_addr)) + if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); @@ -539,7 +539,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, switch (type) { case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; - if (ieee802154_is_valid_extended_addr(extended_addr)) + if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr); else memcpy(ndev->dev_addr, ndev->perm_addr, -- cgit From 17ca8cbf49be3aa94bb1c2b7ee6545fd70094eb4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 May 2015 23:23:06 +0200 Subject: ebpf: allow bpf_ktime_get_ns_proto also for networking As this is already exported from tracing side via commit d9847d310ab4 ("tracing: Allow BPF programs to call bpf_ktime_get_ns()"), we might as well want to move it to the core, so also networking users can make use of it, e.g. to measure diffs for certain flows from ingress/egress. Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Ingo Molnar Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 2c30d6632d66..b78a010a957f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1423,6 +1423,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; default: return NULL; } -- cgit From a28c257c9eb0bd76a4adcac97c07e34044ec71fb Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Fri, 29 May 2015 17:28:07 -0400 Subject: net/rds: Declare SO_RDS_TRANSPORT and RDS_TRANS_* constants in uapi/linux/rds.h User space applications that desire to explicitly select the underlying transport for a PF_RDS socket may do so by using the SO_RDS_TRANSPORT socket option at the SOL_RDS level before bind(). The integer argument provided to the socket option would be one of the RDS_TRANS_* values, e.g., RDS_TRANS_TCP. This commit exports the constant values need by such applications via Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/rds.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index 0d41155a2258..76db508f73a1 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -408,11 +408,6 @@ struct rds_notifier { * should try hard not to block. */ -#define RDS_TRANS_IB 0 -#define RDS_TRANS_IWARP 1 -#define RDS_TRANS_TCP 2 -#define RDS_TRANS_COUNT 3 - struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; -- cgit From d97dac54bf83b2a3df85fba37595e355627e0f2b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Fri, 29 May 2015 17:28:08 -0400 Subject: net/rds: Add setsockopt support for SO_RDS_TRANSPORT An application may deterministically attach the underlying transport for a PF_RDS socket by invoking setsockopt(2) with the SO_RDS_TRANSPORT option at the SOL_RDS level. The integer argument to setsockopt must be one of the RDS_TRANS_* transport types, e.g., RDS_TRANS_TCP. The option must be specified before invoking bind(2) on the socket, and may only be used once on the socket. An attempt to set the option on a bound socket, or to invoke the option after a successful SO_RDS_TRANSPORT attachment, will return EOPNOTSUPP. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/af_rds.c | 27 +++++++++++++++++++++++++++ net/rds/bind.c | 4 ++++ net/rds/rds.h | 1 + net/rds/transport.c | 21 +++++++++++++++++++++ 4 files changed, 53 insertions(+) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 3d83641f2861..04877441fb01 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -270,6 +270,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, return ret; } +static int rds_set_transport(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int t_type; + + if (rs->rs_transport) + return -EOPNOTSUPP; /* previously attached to transport */ + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type))) + return -EFAULT; + + if (t_type < 0 || t_type >= RDS_TRANS_COUNT) + return -EINVAL; + + rs->rs_transport = rds_trans_get(t_type); + + return rs->rs_transport ? 0 : -ENOPROTOOPT; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -300,6 +322,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, case RDS_CONG_MONITOR: ret = rds_cong_monitor(rs, optval, optlen); break; + case SO_RDS_TRANSPORT: + lock_sock(sock->sk); + ret = rds_set_transport(rs, optval, optlen); + release_sock(sock->sk); + break; default: ret = -ENOPROTOOPT; } diff --git a/net/rds/bind.c b/net/rds/bind.c index a2e6562da751..4ebd29c128b6 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -181,6 +181,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (ret) goto out; + if (rs->rs_transport) { /* previously bound */ + ret = 0; + goto out; + } trans = rds_trans_get_preferred(sin->sin_addr.s_addr); if (!trans) { ret = -EADDRNOTAVAIL; diff --git a/net/rds/rds.h b/net/rds/rds.h index 76db508f73a1..a33fb4ad3535 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -798,6 +798,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); +struct rds_transport *rds_trans_get(int t_type); int rds_trans_init(void); void rds_trans_exit(void); diff --git a/net/rds/transport.c b/net/rds/transport.c index 7f2ac4fec367..8b4a6cd2c3a7 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -101,6 +101,27 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr) return ret; } +struct rds_transport *rds_trans_get(int t_type) +{ + struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; + + down_read(&rds_trans_sem); + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && trans->t_type == t_type && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + /* * This returns the number of stats entries in the snapshot and only * copies them using the iter if there is enough space for them. The -- cgit From 8ba38460f363e4d26d666aae9bc7fd3afa5f8e43 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Fri, 29 May 2015 17:28:09 -0400 Subject: net/rds Add getsockopt support for SO_RDS_TRANSPORT The currently attached transport for a PF_RDS socket may be obtained from user space by invoking getsockopt(2) using the SO_RDS_TRANSPORT option at the SOL_RDS level. The integer optval returned will be one of the RDS_TRANS_* constants defined in linux/rds.h. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/af_rds.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 04877441fb01..2ad9032372b2 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -339,6 +339,7 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret = -ENOPROTOOPT, len; + int trans; if (level != SOL_RDS) goto out; @@ -364,6 +365,19 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, else ret = 0; break; + case SO_RDS_TRANSPORT: + if (len < sizeof(int)) { + ret = -EINVAL; + break; + } + trans = (rs->rs_transport ? rs->rs_transport->t_type : + RDS_TRANS_NONE); /* unbound */ + if (put_user(trans, (int __user *)optval) || + put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; default: break; } -- cgit From c9c99f8938e41b878802d5d8481d6f7a212515b0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 1 Jun 2015 14:10:09 +0200 Subject: mac80211: act upon and report deauth while associating When trying to associate, the AP could send a deauth frame instead. Currently mac80211 drops that frame and doesn't report it to the supplicant, which, in some versions and/or in certain circumstances will simply keep trying to associate over and over again instead of trying authentication again. Fix this by reacting to deauth frames while associating, reporting them to the supplicant and dropping the association attempt (which is bound to fail.) Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 92 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 387fe70ab126..853cfa71b96a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2521,6 +2521,34 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.auth_data = NULL; } +static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, + bool assoc) +{ + struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; + + sdata_assert_lock(sdata); + + if (!assoc) { + /* + * we are not associated yet, the only timer that could be + * running is the timeout for the association response which + * which is not relevant anymore. + */ + del_timer_sync(&sdata->u.mgd.timer); + sta_info_destroy_addr(sdata, assoc_data->bss->bssid); + + eth_zero_addr(sdata->u.mgd.bssid); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); + sdata->u.mgd.flags = 0; + mutex_lock(&sdata->local->mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); + } + + kfree(assoc_data); + sdata->u.mgd.assoc_data = NULL; +} + static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { @@ -2713,28 +2741,42 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - const u8 *bssid = NULL; - u16 reason_code; + u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); sdata_assert_lock(sdata); if (len < 24 + 2) return; - if (!ifmgd->associated || - !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) - return; + if (ifmgd->associated && + ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) { + const u8 *bssid = ifmgd->associated->bssid; - bssid = ifmgd->associated->bssid; + sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", + bssid, reason_code, + ieee80211_get_reason_code_string(reason_code)); - reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); + ieee80211_set_disassoc(sdata, 0, 0, false, NULL); - sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", - bssid, reason_code, ieee80211_get_reason_code_string(reason_code)); + ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, + reason_code); + return; + } - ieee80211_set_disassoc(sdata, 0, 0, false, NULL); + if (ifmgd->assoc_data && + ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { + const u8 *bssid = ifmgd->assoc_data->bss->bssid; - ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code); + sdata_info(sdata, + "deauthenticated from %pM while associating (Reason: %u=%s)\n", + bssid, reason_code, + ieee80211_get_reason_code_string(reason_code)); + + ieee80211_destroy_assoc_data(sdata, false); + + cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + return; + } } @@ -2814,34 +2856,6 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, } } -static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, - bool assoc) -{ - struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; - - sdata_assert_lock(sdata); - - if (!assoc) { - /* - * we are not associated yet, the only timer that could be - * running is the timeout for the association response which - * which is not relevant anymore. - */ - del_timer_sync(&sdata->u.mgd.timer); - sta_info_destroy_addr(sdata, assoc_data->bss->bssid); - - eth_zero_addr(sdata->u.mgd.bssid); - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); - sdata->u.mgd.flags = 0; - mutex_lock(&sdata->local->mtx); - ieee80211_vif_release_channel(sdata); - mutex_unlock(&sdata->local->mtx); - } - - kfree(assoc_data); - sdata->u.mgd.assoc_data = NULL; -} - static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len) -- cgit From bdef7de4b8d9be4cf7bf5aea977f827310ab3ff0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Jun 2015 14:56:09 -0700 Subject: net: Add priority to packet_offload objects. When we scan a packet for GRO processing, we want to see the most common packet types in the front of the offload_base list. So add a priority field so we can handle this properly. IPv4/IPv6 get the highest priority with the implicit zero priority field. Next comes ethernet with a priority of 10, and then we have the MPLS types with a priority of 15. Suggested-by: Eric Dumazet Suggested-by: Toshiaki Makita Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- net/ethernet/eth.c | 1 + net/mpls/mpls_gso.c | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 594163d0c6eb..0602e917a305 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -469,10 +469,14 @@ EXPORT_SYMBOL(dev_remove_pack); */ void dev_add_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct packet_offload *elem; spin_lock(&offload_lock); - list_add_rcu(&po->list, head); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index c3325bd2f3fb..7d0e239a6755 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -470,6 +470,7 @@ EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), + .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 809df534a720..0183b32da942 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -62,6 +62,7 @@ out: static struct packet_offload mpls_mc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_MC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, @@ -69,6 +70,7 @@ static struct packet_offload mpls_mc_offload __read_mostly = { static struct packet_offload mpls_uc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, -- cgit From ccea74457bbdafe33dce8bffcb5cb183aeb5f2bb Mon Sep 17 00:00:00 2001 From: Neil McKee Date: Tue, 26 May 2015 20:59:43 -0700 Subject: openvswitch: include datapath actions with sampled-packet upcall to userspace If new optional attribute OVS_USERSPACE_ATTR_ACTIONS is added to an OVS_ACTION_ATTR_USERSPACE action, then include the datapath actions in the upcall. This Directly associates the sampled packet with the path it takes through the virtual switch. Path information currently includes mangling, encapsulation and decapsulation actions for tunneling protocols GRE, VXLAN, Geneve, MPLS and QinQ, but this extension requires no further changes to accommodate datapath actions that may be added in the future. Adding path information enhances visibility into complex virtual networks. Signed-off-by: Neil McKee Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 23 +++++++++++++++-------- net/openvswitch/datapath.c | 18 ++++++++++++++++-- net/openvswitch/datapath.h | 2 ++ 3 files changed, 33 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b491c1c296fe..8a8c0b8b4f63 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -608,17 +608,16 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) } static int output_userspace(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { struct ovs_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.userdata = NULL; - upcall.portid = 0; - upcall.egress_tun_info = NULL; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -647,6 +646,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; } + case OVS_USERSPACE_ATTR_ACTIONS: { + /* Include actions. */ + upcall.actions = actions; + upcall.actions_len = actions_len; + break; + } + } /* End of switch. */ } @@ -654,7 +660,8 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } static int sample(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { const struct nlattr *acts_list = NULL; const struct nlattr *a; @@ -688,7 +695,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, */ if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a); + return output_userspace(dp, skb, key, a, actions, actions_len); skb = skb_clone(skb, GFP_ATOMIC); if (!skb) @@ -872,7 +879,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a); + output_userspace(dp, skb, key, a, attr, len); break; case OVS_ACTION_ATTR_HASH: @@ -916,7 +923,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, key, a); + err = sample(dp, skb, key, a, attr, len); break; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 3b90461317ec..ff8c4a4c1609 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -272,10 +272,9 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_upcall_info upcall; int error; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); - upcall.egress_tun_info = NULL; error = ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -397,6 +396,10 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, if (upcall_info->egress_tun_info) size += nla_total_size(ovs_tun_key_attr_size()); + /* OVS_PACKET_ATTR_ACTIONS */ + if (upcall_info->actions_len) + size += nla_total_size(upcall_info->actions_len); + return size; } @@ -478,6 +481,17 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_nest_end(user_skb, nla); } + if (upcall_info->actions_len) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); + err = ovs_nla_put_actions(upcall_info->actions, + upcall_info->actions_len, + user_skb); + if (!err) + nla_nest_end(user_skb, nla); + else + nla_nest_cancel(user_skb, nla); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4ec4a480b147..cd691e935e08 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -116,6 +116,8 @@ struct ovs_skb_cb { struct dp_upcall_info { const struct ovs_tunnel_info *egress_tun_info; const struct nlattr *userdata; + const struct nlattr *actions; + int actions_len; u32 portid; u8 cmd; }; -- cgit From 66e5133f19e901a044fa5eaeeb6ecff4545839e5 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Mon, 1 Jun 2015 21:55:06 +0900 Subject: vlan: Add GRO support for non hardware accelerated vlan Currently packets with non-hardware-accelerated vlan cannot be handled by GRO. This causes low performance for 802.1ad and stacked vlan, as their vlan tags are currently not stripped by hardware. This patch adds GRO support for non-hardware-accelerated vlan and improves receive performance of them. Test Environment: vlan device (.1Q) on vlan device (.1ad) on ixgbe (82599) Result: - Before $ netperf -t TCP_STREAM -H 192.168.20.2 -l 60 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 60.00 5233.17 Rx side CPU usage: %usr %sys %irq %soft %idle 0.27 58.03 0.00 41.70 0.00 - After $ netperf -t TCP_STREAM -H 192.168.20.2 -l 60 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 60.00 7586.85 Rx side CPU usage: %usr %sys %irq %soft %idle 0.50 25.83 0.00 59.53 14.14 [ Register VLAN offloads with priority 10 -DaveM ] Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- net/8021q/vlan.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 59555f0f8fc8..d2cd9de4b724 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -618,6 +618,92 @@ out: return err; } +static struct sk_buff **vlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff *p, **pp = NULL; + struct vlan_hdr *vhdr; + unsigned int hlen, off_vlan; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_vlan = skb_gro_offset(skb); + hlen = off_vlan + sizeof(*vhdr); + vhdr = skb_gro_header_fast(skb, off_vlan); + if (skb_gro_header_hard(skb, hlen)) { + vhdr = skb_gro_header_slow(skb, hlen, off_vlan); + if (unlikely(!vhdr)) + goto out; + } + + type = vhdr->h_vlan_encapsulated_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) + goto out_unlock; + + flush = 0; + + for (p = *head; p; p = p->next) { + struct vlan_hdr *vhdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); + if (compare_vlan_header(vhdr, vhdr2)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + skb_gro_pull(skb, sizeof(*vhdr)); + skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int vlan_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); + __be16 type = vhdr->h_vlan_encapsulated_proto; + struct packet_offload *ptype; + int err = -ENOENT; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); + + rcu_read_unlock(); + return err; +} + +static struct packet_offload vlan_packet_offloads[] __read_mostly = { + { + .type = cpu_to_be16(ETH_P_8021Q), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, + { + .type = cpu_to_be16(ETH_P_8021AD), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, +}; + static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); @@ -645,6 +731,7 @@ static struct pernet_operations vlan_net_ops = { static int __init vlan_proto_init(void) { int err; + unsigned int i; pr_info("%s v%s\n", vlan_fullname, vlan_version); @@ -668,6 +755,9 @@ static int __init vlan_proto_init(void) if (err < 0) goto err5; + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_add_offload(&vlan_packet_offloads[i]); + vlan_ioctl_set(vlan_ioctl_handler); return 0; @@ -685,7 +775,13 @@ err0: static void __exit vlan_cleanup_module(void) { + unsigned int i; + vlan_ioctl_set(NULL); + + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_remove_offload(&vlan_packet_offloads[i]); + vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); -- cgit From db388a567ff9600debc2433c1fddf79a8fc38b21 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 1 Jun 2015 15:36:51 +0200 Subject: mac80211: move TX PN to public part of key struct For drivers supporting TSO or similar features, but that still have PN assignment in software, there's a need to have some memory to store the current PN value. As mac80211 already stores this and it's somewhat complicated to add a per-driver area to the key struct (due to the dynamic sizing thereof) it makes sense to just move the TX PN to the keyconf, i.e. the public part of the key struct. As TKIP is more complicated and we won't able to offload it in this way right now (fast-xmit is skipped for TKIP unless the HW does it all, and our hardware needs MMIC calculation in software) I've not moved that for now - it's possible but requires exposing a lot of the internal TKIP state. As an bonus side effect, we can remove a lot of code by assuming the keyseq struct has a certain layout - with BUILD_BUG_ON to verify it. This might also improve performance, since now TX and RX no longer share a cacheline. Reviewed-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 59 ++++++---------------------------- net/mac80211/debugfs_key.c | 17 +--------- net/mac80211/key.c | 80 +++++++++++++--------------------------------- net/mac80211/key.h | 4 --- net/mac80211/tx.c | 10 +----- net/mac80211/wpa.c | 10 +++--- 6 files changed, 38 insertions(+), 142 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index bb9f83640b46..02f48c848ef5 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -361,66 +361,25 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && - !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { - drv_get_key_seq(sdata->local, key, &kseq); - memcpy(seq, kseq.ccmp.pn, 6); - } else { - pn64 = atomic64_read(&key->u.ccmp.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - } - params.seq = seq; - params.seq_len = 6; - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && - !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { - drv_get_key_seq(sdata->local, key, &kseq); - memcpy(seq, kseq.aes_cmac.pn, 6); - } else { - pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - } - params.seq = seq; - params.seq_len = 6; - break; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && - !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { - drv_get_key_seq(sdata->local, key, &kseq); - memcpy(seq, kseq.aes_gmac.pn, 6); - } else { - pn64 = atomic64_read(&key->u.aes_gmac.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - } - params.seq = seq; - params.seq_len = 6; - break; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), aes_gmac)); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), gcmp)); + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { drv_get_key_seq(sdata->local, key, &kseq); - memcpy(seq, kseq.gcmp.pn, 6); + memcpy(seq, kseq.ccmp.pn, 6); } else { - pn64 = atomic64_read(&key->u.gcmp.tx_pn); + pn64 = atomic64_read(&key->conf.tx_pn); seq[0] = pn64; seq[1] = pn64 >> 8; seq[2] = pn64 >> 16; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 71ac1b5f4da5..e82bf1e9d7a8 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -95,28 +95,13 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn = atomic64_read(&key->u.ccmp.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn = atomic64_read(&key->u.aes_cmac.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn = atomic64_read(&key->u.aes_gmac.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn = atomic64_read(&key->u.gcmp.tx_pn); + pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 2e677376c958..9a4a4bfafdc2 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -832,27 +832,19 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = atomic64_read(&key->u.ccmp.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = atomic64_read(&key->u.aes_gmac.tx_pn); + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_gmac)); + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), gcmp)); + pn64 = atomic64_read(&key->conf.tx_pn); seq->ccmp.pn[5] = pn64; seq->ccmp.pn[4] = pn64 >> 8; seq->ccmp.pn[3] = pn64 >> 16; @@ -860,16 +852,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, seq->ccmp.pn[1] = pn64 >> 32; seq->ccmp.pn[0] = pn64 >> 40; break; - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = atomic64_read(&key->u.gcmp.tx_pn); - seq->gcmp.pn[5] = pn64; - seq->gcmp.pn[4] = pn64 >> 8; - seq->gcmp.pn[3] = pn64 >> 16; - seq->gcmp.pn[2] = pn64 >> 24; - seq->gcmp.pn[1] = pn64 >> 32; - seq->gcmp.pn[0] = pn64 >> 40; - break; default: WARN_ON(1); } @@ -944,43 +926,25 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = (u64)seq->ccmp.pn[5] | - ((u64)seq->ccmp.pn[4] << 8) | - ((u64)seq->ccmp.pn[3] << 16) | - ((u64)seq->ccmp.pn[2] << 24) | - ((u64)seq->ccmp.pn[1] << 32) | - ((u64)seq->ccmp.pn[0] << 40); - atomic64_set(&key->u.ccmp.tx_pn, pn64); - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = (u64)seq->aes_cmac.pn[5] | - ((u64)seq->aes_cmac.pn[4] << 8) | - ((u64)seq->aes_cmac.pn[3] << 16) | - ((u64)seq->aes_cmac.pn[2] << 24) | - ((u64)seq->aes_cmac.pn[1] << 32) | - ((u64)seq->aes_cmac.pn[0] << 40); - atomic64_set(&key->u.aes_cmac.tx_pn, pn64); - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = (u64)seq->aes_gmac.pn[5] | - ((u64)seq->aes_gmac.pn[4] << 8) | - ((u64)seq->aes_gmac.pn[3] << 16) | - ((u64)seq->aes_gmac.pn[2] << 24) | - ((u64)seq->aes_gmac.pn[1] << 32) | - ((u64)seq->aes_gmac.pn[0] << 40); - atomic64_set(&key->u.aes_gmac.tx_pn, pn64); - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_gmac)); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = (u64)seq->gcmp.pn[5] | - ((u64)seq->gcmp.pn[4] << 8) | - ((u64)seq->gcmp.pn[3] << 16) | - ((u64)seq->gcmp.pn[2] << 24) | - ((u64)seq->gcmp.pn[1] << 32) | - ((u64)seq->gcmp.pn[0] << 40); - atomic64_set(&key->u.gcmp.tx_pn, pn64); + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), gcmp)); + pn64 = (u64)seq->ccmp.pn[5] | + ((u64)seq->ccmp.pn[4] << 8) | + ((u64)seq->ccmp.pn[3] << 16) | + ((u64)seq->ccmp.pn[2] << 24) | + ((u64)seq->ccmp.pn[1] << 32) | + ((u64)seq->ccmp.pn[0] << 40); + atomic64_set(&key->conf.tx_pn, pn64); break; default: WARN_ON(1); diff --git a/net/mac80211/key.h b/net/mac80211/key.h index df430a618764..ac747e442139 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -77,7 +77,6 @@ struct ieee80211_key { u32 mic_failures; } tkip; struct { - atomic64_t tx_pn; /* * Last received packet number. The first * IEEE80211_NUM_TIDS counters are used with Data @@ -89,21 +88,18 @@ struct ieee80211_key { u32 replays; /* dot11RSNAStatsCCMPReplays */ } ccmp; struct { - atomic64_t tx_pn; u8 rx_pn[IEEE80211_CMAC_PN_LEN]; struct crypto_cipher *tfm; u32 replays; /* dot11RSNAStatsCMACReplays */ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ } aes_cmac; struct { - atomic64_t tx_pn; u8 rx_pn[IEEE80211_GMAC_PN_LEN]; struct crypto_aead *tfm; u32 replays; /* dot11RSNAStatsCMACReplays */ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ } aes_gmac; struct { - atomic64_t tx_pn; /* Last received packet number. The first * IEEE80211_NUM_TIDS counters are used with Data * frames and the last counter is used with Robust diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8df134213adf..dbef1b881b3d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2813,17 +2813,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, switch (fast_tx->key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn = atomic64_inc_return(&fast_tx->key->u.ccmp.tx_pn); - crypto_hdr[0] = pn; - crypto_hdr[1] = pn >> 8; - crypto_hdr[4] = pn >> 16; - crypto_hdr[5] = pn >> 24; - crypto_hdr[6] = pn >> 32; - crypto_hdr[7] = pn >> 40; - break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn = atomic64_inc_return(&fast_tx->key->u.gcmp.tx_pn); + pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn); crypto_hdr[0] = pn; crypto_hdr[1] = pn >> 8; crypto_hdr[4] = pn >> 16; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 9d63d93c836e..943f7606527e 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -444,7 +444,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, hdr = (struct ieee80211_hdr *) pos; pos += hdrlen; - pn64 = atomic64_inc_return(&key->u.ccmp.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; @@ -670,7 +670,7 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) hdr = (struct ieee80211_hdr *)pos; pos += hdrlen; - pn64 = atomic64_inc_return(&key->u.gcmp.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; @@ -940,7 +940,7 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); @@ -984,7 +984,7 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); @@ -1129,7 +1129,7 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_gmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); -- cgit From b2eb0ee6d0220b47a1f901e4f1c56dd594509d8e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 1 Jun 2015 22:54:13 +0200 Subject: mac80211: copy nl80211 mgmt TX SKB for status When we return the TX status for an nl80211 mgmt TX SKB, we should also return the original frame with the status to allow userspace to match up the submission (it could also use the cookie but both ways are permissible.) As TX SKBs could be encrypted, at least in the case of ANQP while associated with the AP, copy the original SKB, store it with an ACK frame ID and restructure the status path to use that to return status with the original SKB. Otherwise, userspace (in particular wpa_supplicant) will get confused. Reported-by: Matti Gottlieb Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 39 ++++++++++++++++-- net/mac80211/status.c | 109 +++++++++++++++++++++++++++++++------------------- 2 files changed, 104 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 02f48c848ef5..5ba528f13300 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2,7 +2,7 @@ * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. */ @@ -3290,7 +3290,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; + struct sk_buff *skb, *ack_skb; struct sta_info *sta; const struct ieee80211_mgmt *mgmt = (void *)params->buf; bool need_offchan = false; @@ -3428,8 +3428,41 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, skb->dev = sdata->dev; + if (!params->dont_wait_for_ack) { + unsigned long spin_flags; + int id; + + /* make a copy to preserve the original cookie (in case the + * driver decides to reallocate the skb) and the frame contents + * in case of encryption. + */ + ack_skb = skb_copy(skb, GFP_KERNEL); + if (!ack_skb) { + ret = -ENOMEM; + kfree_skb(skb); + goto out_unlock; + } + + spin_lock_irqsave(&local->ack_status_lock, spin_flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x10000, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); + + if (id < 0) { + ret = -ENOMEM; + kfree_skb(ack_skb); + kfree_skb(skb); + goto out_unlock; + } + + IEEE80211_SKB_CB(skb)->ack_frame_id = id; + } else { + /* for cookie below */ + ack_skb = skb; + } + if (!need_offchan) { - *cookie = (unsigned long) skb; + *cookie = (unsigned long)ack_skb; ieee80211_tx_skb(sdata, skb); ret = 0; goto out_unlock; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 461594966b65..56b73e012757 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -429,6 +429,66 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, } } +static struct ieee80211_sub_if_data * +ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata; + + if (skb->dev) { + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!sdata->dev) + continue; + + if (skb->dev == sdata->dev) + return sdata; + } + + return NULL; + } + + return rcu_dereference(local->p2p_sdata); +} + +static void ieee80211_report_ack_skb(struct ieee80211_local *local, + struct ieee80211_tx_info *info, + bool acked, bool dropped) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&local->ack_status_lock, flags); + skb = idr_find(&local->ack_status_frames, info->ack_frame_id); + if (skb) + idr_remove(&local->ack_status_frames, info->ack_frame_id); + spin_unlock_irqrestore(&local->ack_status_lock, flags); + + if (!skb) + return; + + if (dropped) { + dev_kfree_skb_any(skb); + return; + } + + if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { + struct ieee80211_sub_if_data *sdata; + + rcu_read_lock(); + sdata = ieee80211_sdata_from_skb(local, skb); + if (sdata) + cfg80211_mgmt_tx_status(&sdata->wdev, + (unsigned long)skb, + skb->data, skb->len, + acked, GFP_ATOMIC); + rcu_read_unlock(); + + dev_kfree_skb_any(skb); + } else { + /* consumes skb */ + skb_complete_wifi_ack(skb, acked); + } +} + static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { @@ -440,27 +500,14 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, acked = false; if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX | - IEEE80211_TX_INTFL_MLME_CONN_TX)) { - struct ieee80211_sub_if_data *sdata = NULL; - struct ieee80211_sub_if_data *iter_sdata; + IEEE80211_TX_INTFL_MLME_CONN_TX) && + !info->ack_frame_id) { + struct ieee80211_sub_if_data *sdata; u64 cookie = (unsigned long)skb; rcu_read_lock(); - if (skb->dev) { - list_for_each_entry_rcu(iter_sdata, &local->interfaces, - list) { - if (!iter_sdata->dev) - continue; - - if (skb->dev == iter_sdata->dev) { - sdata = iter_sdata; - break; - } - } - } else { - sdata = rcu_dereference(local->p2p_sdata); - } + sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; @@ -483,33 +530,13 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, GFP_ATOMIC); } else { - cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, - skb->len, acked, GFP_ATOMIC); + /* we assign ack frame ID for the others */ + WARN_ON(1); } rcu_read_unlock(); - } - - if (unlikely(info->ack_frame_id)) { - struct sk_buff *ack_skb; - unsigned long flags; - - spin_lock_irqsave(&local->ack_status_lock, flags); - ack_skb = idr_find(&local->ack_status_frames, - info->ack_frame_id); - if (ack_skb) - idr_remove(&local->ack_status_frames, - info->ack_frame_id); - spin_unlock_irqrestore(&local->ack_status_lock, flags); - - if (ack_skb) { - if (!dropped) { - /* consumes ack_skb */ - skb_complete_wifi_ack(ack_skb, acked); - } else { - dev_kfree_skb_any(ack_skb); - } - } + } else if (info->ack_frame_id) { + ieee80211_report_ack_skb(local, info, acked, dropped); } } -- cgit From 3b79af973cf42de059d0e90e20fd145d7ed8c5c1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 1 Jun 2015 23:14:59 +0200 Subject: mac80211: stop using pointers as userspace cookies Even if the pointers are really only accessible to root and used pretty much only by wpa_supplicant, this is still not great; even for debugging it'd be easier to have something that's easier to read and guaranteed to never get reused. With the recent change to make mac80211 create an ack_skb for the mgmt-tx path this becomes possible, only the client probe method needs to also allocate an ack_skb, and we can store the cookie in that skb. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 115 ++++++++++++++++++++++++++++++++------------------ net/mac80211/status.c | 27 ++++++------ 2 files changed, 87 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5ba528f13300..1a17d3208d8f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2546,6 +2546,19 @@ static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, return true; } +static u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) +{ + lockdep_assert_held(&local->mtx); + + local->roc_cookie_counter++; + + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(local->roc_cookie_counter == 0)) + local->roc_cookie_counter++; + + return local->roc_cookie_counter; +} + static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, @@ -2583,7 +2596,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, roc->req_duration = duration; roc->frame = txskb; roc->type = type; - roc->mgmt_tx_cookie = (unsigned long)txskb; roc->sdata = sdata; INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); @@ -2593,17 +2605,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, * or the SKB (for mgmt TX) */ if (!txskb) { - /* local->mtx protects this */ - local->roc_cookie_counter++; - roc->cookie = local->roc_cookie_counter; - /* wow, you wrapped 64 bits ... more likely a bug */ - if (WARN_ON(roc->cookie == 0)) { - roc->cookie = 1; - local->roc_cookie_counter++; - } + roc->cookie = ieee80211_mgmt_tx_cookie(local); *cookie = roc->cookie; } else { - *cookie = (unsigned long)txskb; + roc->mgmt_tx_cookie = *cookie; } /* if there's one pending or we're scanning, queue this one */ @@ -3284,6 +3289,36 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, return err; } +static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local, + struct sk_buff *skb, u64 *cookie, + gfp_t gfp) +{ + unsigned long spin_flags; + struct sk_buff *ack_skb; + int id; + + ack_skb = skb_copy(skb, gfp); + if (!ack_skb) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&local->ack_status_lock, spin_flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x10000, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); + + if (id < 0) { + kfree_skb(ack_skb); + return ERR_PTR(-ENOMEM); + } + + IEEE80211_SKB_CB(skb)->ack_frame_id = id; + + *cookie = ieee80211_mgmt_tx_cookie(local); + IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; + + return ack_skb; +} + static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) @@ -3429,40 +3464,22 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, skb->dev = sdata->dev; if (!params->dont_wait_for_ack) { - unsigned long spin_flags; - int id; - - /* make a copy to preserve the original cookie (in case the - * driver decides to reallocate the skb) and the frame contents + /* make a copy to preserve the frame contents * in case of encryption. */ - ack_skb = skb_copy(skb, GFP_KERNEL); - if (!ack_skb) { - ret = -ENOMEM; - kfree_skb(skb); - goto out_unlock; - } - - spin_lock_irqsave(&local->ack_status_lock, spin_flags); - id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x10000, GFP_ATOMIC); - spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); - - if (id < 0) { - ret = -ENOMEM; - kfree_skb(ack_skb); + ack_skb = ieee80211_make_ack_skb(local, skb, cookie, + GFP_KERNEL); + if (IS_ERR(ack_skb)) { + ret = PTR_ERR(ack_skb); kfree_skb(skb); goto out_unlock; } - - IEEE80211_SKB_CB(skb)->ack_frame_id = id; } else { /* for cookie below */ ack_skb = skb; } if (!need_offchan) { - *cookie = (unsigned long)ack_skb; ieee80211_tx_skb(sdata, skb); ret = 0; goto out_unlock; @@ -3555,7 +3572,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; - struct sk_buff *skb; + struct sk_buff *skb, *ack_skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; @@ -3563,20 +3580,24 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; enum ieee80211_band band; + int ret; + + /* the lock is needed to assign the cookie later */ + mutex_lock(&local->mtx); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); - return -EINVAL; + ret = -EINVAL; + goto unlock; } band = chanctx_conf->def.chan->band; sta = sta_info_get_bss(sdata, peer); if (sta) { qos = sta->sta.wme; } else { - rcu_read_unlock(); - return -ENOLINK; + ret = -ENOLINK; + goto unlock; } if (qos) { @@ -3592,8 +3613,8 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) { - rcu_read_unlock(); - return -ENOMEM; + ret = -ENOMEM; + goto unlock; } skb->dev = dev; @@ -3619,13 +3640,23 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); + ack_skb = ieee80211_make_ack_skb(local, skb, cookie, GFP_ATOMIC); + if (IS_ERR(ack_skb)) { + kfree_skb(skb); + ret = PTR_ERR(ack_skb); + goto unlock; + } + local_bh_disable(); ieee80211_xmit(sdata, sta, skb); local_bh_enable(); + + ret = 0; +unlock: rcu_read_unlock(); + mutex_unlock(&local->mtx); - *cookie = (unsigned long) skb; - return 0; + return ret; } static int ieee80211_cfg_get_channel(struct wiphy *wiphy, diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 56b73e012757..67c428735a51 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -471,15 +471,23 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, } if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { + u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr = (void *)skb->data; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); - if (sdata) - cfg80211_mgmt_tx_status(&sdata->wdev, - (unsigned long)skb, - skb->data, skb->len, - acked, GFP_ATOMIC); + if (sdata) { + if (ieee80211_is_nullfunc(hdr->frame_control) || + ieee80211_is_qos_nullfunc(hdr->frame_control)) + cfg80211_probe_status(sdata->dev, hdr->addr1, + cookie, acked, + GFP_ATOMIC); + else + cfg80211_mgmt_tx_status(&sdata->wdev, cookie, + skb->data, skb->len, + acked, GFP_ATOMIC); + } rcu_read_unlock(); dev_kfree_skb_any(skb); @@ -499,11 +507,8 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, if (dropped) acked = false; - if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX | - IEEE80211_TX_INTFL_MLME_CONN_TX) && - !info->ack_frame_id) { + if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; - u64 cookie = (unsigned long)skb; rcu_read_lock(); @@ -525,10 +530,6 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); - } else if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) { - cfg80211_probe_status(sdata->dev, hdr->addr1, - cookie, acked, GFP_ATOMIC); } else { /* we assign ack frame ID for the others */ WARN_ON(1); -- cgit From 1caf6f476e90f592c2502a82bdef423cf950d011 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Jun 2015 15:55:17 +0200 Subject: ieee802154: 6lowpan: set ackreq when needed This patch sets the acknowledge request bit inside the 802.15.4 mac header when frame retries is 0 or above. The other frame retries value which is -1 indicates that the transmitter doesn't care about an acknowledge frame which will be ignored after transmitting if the node sends anyway an ack frame after receiving. This is currently unnecessary traffic if the max frame retries parameter is -1. Signed-off-by: Alexander Aring Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/tx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 98acf7319754..2597abbf7f4b 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -190,6 +190,7 @@ err: static int lowpan_header(struct sk_buff *skb, struct net_device *dev) { + struct wpan_dev *wpan_dev = lowpan_dev_info(dev)->real_dev->ieee802154_ptr; struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; @@ -207,7 +208,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) /* prepare wpan address data */ sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = lowpan_dev_info(dev)->real_dev->ieee802154_ptr->pan_id; + sa.pan_id = wpan_dev->pan_id; sa.extended_addr = ieee802154_devaddr_from_raw(saddr); /* intra-PAN communications */ @@ -223,7 +224,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) } else { da.mode = IEEE802154_ADDR_LONG; da.extended_addr = ieee802154_devaddr_from_raw(daddr); - cb->ackreq = true; + cb->ackreq = wpan_dev->frame_retries >= 0; } return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, -- cgit From 0ecc4e688b6e33f8314c2b074335e134e0b2c4ae Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Mon, 1 Jun 2015 14:22:26 +0530 Subject: mac802154: add trace functionality for driver ops This patch adds trace events for driver operations. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/Makefile | 4 +- net/mac802154/driver-ops.h | 92 ++++++++++++--- net/mac802154/trace.c | 9 ++ net/mac802154/trace.h | 272 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 362 insertions(+), 15 deletions(-) create mode 100644 net/mac802154/trace.c create mode 100644 net/mac802154/trace.h (limited to 'net') diff --git a/net/mac802154/Makefile b/net/mac802154/Makefile index 702d8b466317..17a51e8389e2 100644 --- a/net/mac802154/Makefile +++ b/net/mac802154/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_MAC802154) += mac802154.o mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \ - iface.o llsec.o util.o cfg.o + iface.o llsec.o util.o cfg.o trace.o + +CFLAGS_trace.o := -I$(src) ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index caecd5f43aa7..0550f3365e33 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -7,6 +7,7 @@ #include #include "ieee802154_i.h" +#include "trace.h" static inline int drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb) @@ -27,19 +28,25 @@ drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) static inline int drv_start(struct ieee802154_local *local) { + int ret; + might_sleep(); + trace_802154_drv_start(local); local->started = true; smp_mb(); - - return local->ops->start(&local->hw); + ret = local->ops->start(&local->hw); + trace_802154_drv_return_int(local, ret); + return ret; } static inline void drv_stop(struct ieee802154_local *local) { might_sleep(); + trace_802154_drv_stop(local); local->ops->stop(&local->hw); + trace_802154_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); @@ -53,13 +60,20 @@ static inline void drv_stop(struct ieee802154_local *local) static inline int drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) { + int ret; + might_sleep(); - return local->ops->set_channel(&local->hw, page, channel); + trace_802154_drv_set_channel(local, page, channel); + ret = local->ops->set_channel(&local->hw, page, channel); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { + int ret; + might_sleep(); if (!local->ops->set_txpower) { @@ -67,12 +81,17 @@ static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) return -EOPNOTSUPP; } - return local->ops->set_txpower(&local->hw, mbm); + trace_802154_drv_set_tx_power(local, mbm); + ret = local->ops->set_txpower(&local->hw, mbm); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_cca_mode(struct ieee802154_local *local, const struct wpan_phy_cca *cca) { + int ret; + might_sleep(); if (!local->ops->set_cca_mode) { @@ -80,11 +99,16 @@ static inline int drv_set_cca_mode(struct ieee802154_local *local, return -EOPNOTSUPP; } - return local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_set_cca_mode(local, cca); + ret = local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) { + int ret; + might_sleep(); if (!local->ops->set_lbt) { @@ -92,12 +116,17 @@ static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) return -EOPNOTSUPP; } - return local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_set_lbt_mode(local, mode); + ret = local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { + int ret; + might_sleep(); if (!local->ops->set_cca_ed_level) { @@ -105,12 +134,16 @@ drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) return -EOPNOTSUPP; } - return local->ops->set_cca_ed_level(&local->hw, mbm); + trace_802154_drv_set_cca_ed_level(local, mbm); + ret = local->ops->set_cca_ed_level(&local->hw, mbm); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -121,14 +154,18 @@ static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) filt.pan_id = pan_id; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_pan_id(local, pan_id); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANID_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -139,14 +176,18 @@ drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) filt.ieee_addr = extended_addr; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_extended_addr(local, extended_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_IEEEADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -157,14 +198,18 @@ drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) filt.short_addr = short_addr; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_short_addr(local, short_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_SADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -175,14 +220,19 @@ drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) filt.pan_coord = is_coord; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_pan_coord(local, is_coord); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANC_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, u8 max_csma_backoffs) { + int ret; + might_sleep(); if (!local->ops->set_csma_params) { @@ -190,13 +240,19 @@ drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, return -EOPNOTSUPP; } - return local->ops->set_csma_params(&local->hw, min_be, max_be, + trace_802154_drv_set_csma_params(local, min_be, max_be, + max_csma_backoffs); + ret = local->ops->set_csma_params(&local->hw, min_be, max_be, max_csma_backoffs); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) { + int ret; + might_sleep(); if (!local->ops->set_frame_retries) { @@ -204,12 +260,17 @@ drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) return -EOPNOTSUPP; } - return local->ops->set_frame_retries(&local->hw, max_frame_retries); + trace_802154_drv_set_max_frame_retries(local, max_frame_retries); + ret = local->ops->set_frame_retries(&local->hw, max_frame_retries); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) { + int ret; + might_sleep(); if (!local->ops->set_promiscuous_mode) { @@ -217,7 +278,10 @@ drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) return -EOPNOTSUPP; } - return local->ops->set_promiscuous_mode(&local->hw, on); + trace_802154_drv_set_promiscuous_mode(local, on); + ret = local->ops->set_promiscuous_mode(&local->hw, on); + trace_802154_drv_return_int(local, ret); + return ret; } #endif /* __MAC802154_DRIVER_OPS */ diff --git a/net/mac802154/trace.c b/net/mac802154/trace.c new file mode 100644 index 000000000000..863e5e6b983d --- /dev/null +++ b/net/mac802154/trace.c @@ -0,0 +1,9 @@ +#include + +#ifndef __CHECKER__ +#include +#include "driver-ops.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h new file mode 100644 index 000000000000..6f30e0c93a16 --- /dev/null +++ b/net/mac802154/trace.h @@ -0,0 +1,272 @@ +/* Based on net/mac80211/trace.h */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mac802154 + +#if !defined(__MAC802154_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __MAC802154_DRIVER_TRACE + +#include + +#include +#include "ieee802154_i.h" + +#define MAXNAME 32 +#define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME) +#define LOCAL_ASSIGN strlcpy(__entry->wpan_phy_name, \ + wpan_phy_name(local->hw.phy), MAXNAME) +#define LOCAL_PR_FMT "%s" +#define LOCAL_PR_ARG __entry->wpan_phy_name + +#define CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ + __field(enum nl802154_cca_opts, cca_opt) +#define CCA_ASSIGN \ + do { \ + (__entry->cca_mode) = cca->mode; \ + (__entry->cca_opt) = cca->opt; \ + } while (0) +#define CCA_PR_FMT "cca_mode: %d, cca_opt: %d" +#define CCA_PR_ARG __entry->cca_mode, __entry->cca_opt + +#define BOOL_TO_STR(bo) (bo) ? "true" : "false" + +/* Tracing for driver callbacks */ + +DECLARE_EVENT_CLASS(local_only_evt, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local), + TP_STRUCT__entry( + LOCAL_ENTRY + ), + TP_fast_assign( + LOCAL_ASSIGN; + ), + TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_return_void, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(802154_drv_return_int, + TP_PROTO(struct ieee802154_local *local, int ret), + TP_ARGS(local, ret), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, ret) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + ), + TP_printk(LOCAL_PR_FMT ", returned: %d", LOCAL_PR_ARG, + __entry->ret) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_start, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_stop, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(802154_drv_set_channel, + TP_PROTO(struct ieee802154_local *local, u8 page, u8 channel), + TP_ARGS(local, page, channel), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, page) + __field(u8, channel) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->page = page; + __entry->channel = channel; + ), + TP_printk(LOCAL_PR_FMT ", page: %d, channel: %d", LOCAL_PR_ARG, + __entry->page, __entry->channel) +); + +TRACE_EVENT(802154_drv_set_cca_mode, + TP_PROTO(struct ieee802154_local *local, + const struct wpan_phy_cca *cca), + TP_ARGS(local, cca), + TP_STRUCT__entry( + LOCAL_ENTRY + CCA_ENTRY + ), + TP_fast_assign( + LOCAL_ASSIGN; + CCA_ASSIGN; + ), + TP_printk(LOCAL_PR_FMT ", " CCA_PR_FMT, LOCAL_PR_ARG, + CCA_PR_ARG) +); + +TRACE_EVENT(802154_drv_set_cca_ed_level, + TP_PROTO(struct ieee802154_local *local, s32 mbm), + TP_ARGS(local, mbm), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s32, mbm) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->mbm = mbm; + ), + TP_printk(LOCAL_PR_FMT ", ed level: %d", LOCAL_PR_ARG, + __entry->mbm) +); + +TRACE_EVENT(802154_drv_set_tx_power, + TP_PROTO(struct ieee802154_local *local, s32 power), + TP_ARGS(local, power), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s32, power) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->power = power; + ), + TP_printk(LOCAL_PR_FMT ", mbm: %d", LOCAL_PR_ARG, + __entry->power) +); + +TRACE_EVENT(802154_drv_set_lbt_mode, + TP_PROTO(struct ieee802154_local *local, bool mode), + TP_ARGS(local, mode), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, mode) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->mode = mode; + ), + TP_printk(LOCAL_PR_FMT ", lbt mode: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->mode)) +); + +TRACE_EVENT(802154_drv_set_short_addr, + TP_PROTO(struct ieee802154_local *local, __le16 short_addr), + TP_ARGS(local, short_addr), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le16, short_addr) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->short_addr = short_addr; + ), + TP_printk(LOCAL_PR_FMT ", short addr: 0x%04x", LOCAL_PR_ARG, + le16_to_cpu(__entry->short_addr)) +); + +TRACE_EVENT(802154_drv_set_pan_id, + TP_PROTO(struct ieee802154_local *local, __le16 pan_id), + TP_ARGS(local, pan_id), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le16, pan_id) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->pan_id = pan_id; + ), + TP_printk(LOCAL_PR_FMT ", pan id: 0x%04x", LOCAL_PR_ARG, + le16_to_cpu(__entry->pan_id)) +); + +TRACE_EVENT(802154_drv_set_extended_addr, + TP_PROTO(struct ieee802154_local *local, __le64 extended_addr), + TP_ARGS(local, extended_addr), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le64, extended_addr) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->extended_addr = extended_addr; + ), + TP_printk(LOCAL_PR_FMT ", extended addr: 0x%llx", LOCAL_PR_ARG, + le64_to_cpu(__entry->extended_addr)) +); + +TRACE_EVENT(802154_drv_set_pan_coord, + TP_PROTO(struct ieee802154_local *local, bool is_coord), + TP_ARGS(local, is_coord), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, is_coord) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->is_coord = is_coord; + ), + TP_printk(LOCAL_PR_FMT ", is_coord: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->is_coord)) +); + +TRACE_EVENT(802154_drv_set_csma_params, + TP_PROTO(struct ieee802154_local *local, u8 min_be, u8 max_be, + u8 max_csma_backoffs), + TP_ARGS(local, min_be, max_be, max_csma_backoffs), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, min_be) + __field(u8, max_be) + __field(u8, max_csma_backoffs) + ), + TP_fast_assign( + LOCAL_ASSIGN, + __entry->min_be = min_be; + __entry->max_be = max_be; + __entry->max_csma_backoffs = max_csma_backoffs; + ), + TP_printk(LOCAL_PR_FMT ", min be: %d, max be: %d, max csma backoffs: %d", + LOCAL_PR_ARG, __entry->min_be, __entry->max_be, + __entry->max_csma_backoffs) +); + +TRACE_EVENT(802154_drv_set_max_frame_retries, + TP_PROTO(struct ieee802154_local *local, s8 max_frame_retries), + TP_ARGS(local, max_frame_retries), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s8, max_frame_retries) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->max_frame_retries = max_frame_retries; + ), + TP_printk(LOCAL_PR_FMT ", max frame retries: %d", LOCAL_PR_ARG, + __entry->max_frame_retries) +); + +TRACE_EVENT(802154_drv_set_promiscuous_mode, + TP_PROTO(struct ieee802154_local *local, bool on), + TP_ARGS(local, on), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, on) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->on = on; + ), + TP_printk(LOCAL_PR_FMT ", promiscuous mode: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->on)) +); + +#endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include -- cgit From ea1b2b45f513c6f9ee49b465b1a9281feb783532 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jun 2015 20:15:49 +0200 Subject: mac80211: remove short slot/short preamble incapable flags There are no drivers setting IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE or IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE, so any code using the two flags is dead; it's also exceedingly unlikely that any new driver could ever need to set these flags. The wcn36xx code is almost certainly broken, but this preserves the previous behaviour. Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 7 ------- net/mac80211/mesh_plink.c | 4 +--- net/mac80211/mlme.c | 6 ++---- net/mac80211/tdls.c | 17 +++++------------ 4 files changed, 8 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index b17206db49b4..1b94d2704c27 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -1,4 +1,3 @@ - /* * mac80211 debugfs for wireless PHYs * @@ -112,12 +111,6 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, if (local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING) sf += scnprintf(buf + sf, mxln - sf, "HOST_BCAST_PS_BUFFERING\n"); - if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE) - sf += scnprintf(buf + sf, mxln - sf, - "2GHZ_SHORT_SLOT_INCAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE) - sf += scnprintf(buf + sf, mxln - sf, - "2GHZ_SHORT_PREAMBLE_INCAPABLE\n"); if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n"); if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index ac843fc88745..72a127e8a1b6 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -106,9 +106,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) /* (IEEE 802.11-2012 19.4.5) */ short_slot = true; goto out; - } else if (band != IEEE80211_BAND_2GHZ || - (band == IEEE80211_BAND_2GHZ && - local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) + } else if (band != IEEE80211_BAND_2GHZ) goto out; for (i = 0; i < sband->n_bitrates; i++) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 853cfa71b96a..94274ef569fa 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -669,10 +669,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) capab = WLAN_CAPABILITY_ESS; if (sband->band == IEEE80211_BAND_2GHZ) { - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; + capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } if (assoc_data->capability & WLAN_CAPABILITY_PRIVACY) diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 75e8e3bba538..28298cc50326 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -167,23 +167,16 @@ static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb) static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata, u16 status_code) { - struct ieee80211_local *local = sdata->local; - u16 capab; - /* The capability will be 0 when sending a failure code */ if (status_code != 0) return 0; - capab = 0; - if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ) - return capab; - - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_2GHZ) { + return WLAN_CAPABILITY_SHORT_SLOT_TIME | + WLAN_CAPABILITY_SHORT_PREAMBLE; + } - return capab; + return 0; } static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, -- cgit From c526a467671960922b5cb5fc385a1813602526bc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jun 2015 20:32:00 +0200 Subject: mac80211: rename single hw-scan flag to follow naming convention The naming convention is to always have the flags prefixed with IEEE80211_HW_ so they're 'namespaced', make this flag follow it. Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 7bb6a9383f58..3eb121d2aa45 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -257,7 +257,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; - if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { + if (local->hw.flags & IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS) { for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); @@ -326,7 +326,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) return; if (hw_scan && !aborted && - !(local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) && + !(local->hw.flags & IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(local)) { int rc; @@ -520,7 +520,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; - if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { + if (local->hw.flags & IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS) { int i, n_bands = 0; u8 bands_counted = 0; -- cgit From 940d156f52d27c16f3e0890e82974d500e40e64f Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:31 +0100 Subject: batman-adv: iv_ogm_aggregate_new, simplify error handling It is just a bit easier to put the error handling at one place and let multiple error paths use the same calls. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index c5ba7a798de7..190a64299090 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -642,19 +642,16 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "batman packet queue full\n"); - goto out; + goto out_free_outgoing; } } forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC); - if (!forw_packet_aggr) { - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); - goto out; - } + if (!forw_packet_aggr) + goto out_nomem; - if ((atomic_read(&bat_priv->aggregated_ogms)) && - (packet_len < BATADV_MAX_AGGREGATION_BYTES)) + if (atomic_read(&bat_priv->aggregated_ogms) && + packet_len < BATADV_MAX_AGGREGATION_BYTES) skb_size = BATADV_MAX_AGGREGATION_BYTES; else skb_size = packet_len; @@ -662,12 +659,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb_size += ETH_HLEN; forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); - if (!forw_packet_aggr->skb) { - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); - kfree(forw_packet_aggr); - goto out; - } + if (!forw_packet_aggr->skb) + goto out_free_forw_packet; forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); @@ -699,7 +692,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, send_time - jiffies); return; -out: +out_free_forw_packet: + kfree(forw_packet_aggr); +out_nomem: + if (!own_packet) + atomic_inc(&bat_priv->batman_queue_left); +out_free_outgoing: batadv_hardif_free_ref(if_outgoing); out_free_incoming: batadv_hardif_free_ref(if_incoming); -- cgit From 564891510eb139dd073a0f1a5a7f5641b5bcc238 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:32 +0100 Subject: batman-adv: iv_ogm_queue_add, Simplify expressions Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 190a64299090..159f30af1dc3 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -750,13 +750,13 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned long max_aggregation_jiffies; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; - direct_link = batadv_ogm_packet->flags & BATADV_DIRECTLINK ? 1 : 0; + direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK); max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* find position for the packet in the forward queue */ spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ - if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) { + if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) { hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, -- cgit From 23badd6dbe5ed66668b0084950c5f66062923e10 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:33 +0100 Subject: batman-adv: iv_ogm_orig_update, style, add missing brackets CodingStyle describes that either none or both branches of a conditional have to have brackets. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 159f30af1dc3..66ac9e85d1ce 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1032,9 +1032,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, batadv_orig_node_free_ref(orig_tmp); if (!neigh_node) goto unlock; - } else + } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Updating existing last-hop neighbor of originator\n"); + } rcu_read_unlock(); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); -- cgit From 9f52ee19c3c73763138ccc307858d9877242c0ad Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:34 +0100 Subject: batman-adv: iv_ogm, Fix dup_status comment Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 66ac9e85d1ce..23c41d17d999 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -28,7 +28,7 @@ /** * enum batadv_dup_status - duplicate status - * @BATADV_NO_DUP: the packet is a duplicate + * @BATADV_NO_DUP: the packet is no duplicate * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the * neighbor) * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor -- cgit From 6c4a1622e21fde1a0287ada3bedcbf75ddbc80ac Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:35 +0100 Subject: batman-adv: iv_ogm, fix coding style The kernel coding style says, that there should not be multiple assignments in one row. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 23c41d17d999..0019f1bfbe8b 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -64,7 +64,9 @@ static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) { const uint8_t *ptr; - uint16_t count = 0, i = 0, sum = 0; + uint16_t count = 0; + uint16_t i = 0; + uint16_t sum = 0; ptr = lq_recv; -- cgit From d491dbb68b6efd12a27ee10791e27fc5f2bd02ff Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:36 +0100 Subject: batman-adv: iv_ogm, fix comment function name This is a small copy paste fix for batadv_ing_buffer_avg. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 0019f1bfbe8b..9f83137a6514 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -55,7 +55,7 @@ static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, } /** - * batadv_ring_buffer_set - compute the average of all non-zero values stored + * batadv_ring_buffer_avg - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * -- cgit From 21102626da60a6fcb0458e4040ee4739f04652e8 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:37 +0100 Subject: batman-adv: types, Fix comment on bcast_own batadv_orig_bat_iv->bcast_own is actually not a bitfield, it is an array. Adjust the comment accordingly. Signed-off-by: Markus Pargmann Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/types.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index e95db4273356..c1000c0d6b0d 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -183,9 +183,10 @@ struct batadv_orig_node_vlan { /** * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members - * @bcast_own: bitfield containing the number of our OGMs this orig_node - * rebroadcasted "back" to us (relative to last_real_seqno) - * @bcast_own_sum: counted result of bcast_own + * @bcast_own: set of bitfields (one per hard interface) where each one counts + * the number of our OGMs this orig_node rebroadcasted "back" to us (relative + * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long. + * @bcast_own_sum: sum of bcast_own * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count */ -- cgit From a0c77227ffd0ad371cfde84458f440bcde5ab048 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 1 Mar 2015 16:56:26 +0100 Subject: batman-adv: Remove unnecessary check for orig_ifinfo not NULL orig_ifinfo is dereferenced multiple times in batadv_iv_ogm_update_seqnos before the check for NULL is done. The function also exists at the beginning when orig_ifinfo would have been NULL. This makes the check at the end unnecessary and only confuses the reader/code analyzers. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 9f83137a6514..4e93d2d3958b 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1357,8 +1357,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_free_ref(orig_node); - if (orig_ifinfo) - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_free_ref(orig_ifinfo); return ret; } -- cgit From e8ad3b1acfbb022b495f12f4ed6e825d5f53546a Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:38 +0100 Subject: batman-adv: main, Convert is_my_mac() to bool It is much clearer to see a bool type as return value than 'int' for functions that are supposed to return true or false. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/main.c | 11 +++++++---- net/batman-adv/main.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index fd9333dffc97..a22247a61d3e 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -209,10 +209,13 @@ void batadv_mesh_free(struct net_device *soft_iface) * interfaces in the current mesh * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check + * + * Returns 'true' if the mac address was found, false otherwise. */ -int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) { const struct batadv_hard_iface *hard_iface; + bool is_my_mac = false; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -223,12 +226,12 @@ int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) continue; if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { - rcu_read_unlock(); - return 1; + is_my_mac = true; + break; } } rcu_read_unlock(); - return 0; + return is_my_mac; } /** diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 026ba37f31a6..96876d29f952 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -195,7 +195,7 @@ extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); -int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq); int batadv_max_header_len(void); -- cgit From f2d5cf2add74ac066572b5215ba8a33dfcf0b85c Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:39 +0100 Subject: batman-adv: main, batadv_compare_eth return bool Declare the returntype of batadv_compare_eth as bool. The function called inside this helper function (ether_addr_equal_unaligned) also uses bool as return value, so there is no need to return int. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 96876d29f952..af0a3361d4b2 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -279,7 +279,7 @@ static inline void _batadv_dbg(int type __always_unused, * * note: can't use ether_addr_equal() as it requires aligned memory */ -static inline int batadv_compare_eth(const void *data1, const void *data2) +static inline bool batadv_compare_eth(const void *data1, const void *data2) { return ether_addr_equal_unaligned(data1, data2); } -- cgit From 9fb6c6519b10aee4815591122ce59ce4b63fc44b Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:40 +0100 Subject: batman-adv: Remove unnecessary ret variable We can avoid this indirect return variable by directly returning the error values. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/main.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index a22247a61d3e..49f07e1e0f5f 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -513,14 +513,12 @@ static struct batadv_algo_ops *batadv_algo_get(char *name) int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; - int ret; bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); if (bat_algo_ops_tmp) { pr_info("Trying to register already registered routing algorithm: %s\n", bat_algo_ops->name); - ret = -EEXIST; - goto out; + return -EEXIST; } /* all algorithms must implement all ops (for now) */ @@ -534,16 +532,13 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) !bat_algo_ops->bat_neigh_is_equiv_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); - ret = -EINVAL; - goto out; + return -EINVAL; } INIT_HLIST_NODE(&bat_algo_ops->list); hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); - ret = 0; -out: - return ret; + return 0; } int batadv_algo_select(struct batadv_priv *bat_priv, char *name) -- cgit From f372d09059a67f57b3d2b5eab8c05b0ddb8ca92f Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 26 Dec 2014 12:41:41 +0100 Subject: batman-adv: Remove unnecessary ret variable in algo_register Remove ret variable and all jumps. Signed-off-by: Markus Pargmann Signed-off-by: Marek Lindner --- net/batman-adv/main.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 49f07e1e0f5f..548e405d13c1 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -544,17 +544,14 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) int batadv_algo_select(struct batadv_priv *bat_priv, char *name) { struct batadv_algo_ops *bat_algo_ops; - int ret = -EINVAL; bat_algo_ops = batadv_algo_get(name); if (!bat_algo_ops) - goto out; + return -EINVAL; bat_priv->bat_algo_ops = bat_algo_ops; - ret = 0; -out: - return ret; + return 0; } int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) -- cgit From 3896d655f4d491c67d669a15f275a39f713410f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Jun 2015 16:03:14 -0700 Subject: bpf: introduce bpf_clone_redirect() helper Allow eBPF programs attached to classifier/actions to call bpf_clone_redirect(skb, ifindex, flags) helper which will mirror or redirect the packet by dynamic ifindex selection from within the program to a target device either at ingress or at egress. Can be used for various scenarios, for example, to load balance skbs into veths, split parts of the traffic to local taps, etc. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index b78a010a957f..64c121c09655 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -46,6 +46,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1407,6 +1408,43 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) + +static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; + struct net_device *dev; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); + if (unlikely(!dev)) + return -EINVAL; + + if (unlikely(!(dev->flags & IFF_UP))) + return -EINVAL; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb2)) + return -ENOMEM; + + if (G_TC_AT(skb2->tc_verd) & AT_INGRESS) + skb_push(skb2, skb2->mac_len); + + if (BPF_IS_REDIRECT_INGRESS(flags)) + return dev_forward_skb(dev, skb2); + + skb2->dev = dev; + return dev_queue_xmit(skb2); +} + +const struct bpf_func_proto bpf_clone_redirect_proto = { + .func = bpf_clone_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1440,6 +1478,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; default: return sk_filter_func_proto(func_id); } -- cgit From 7616dcbb212eeec00c9bcc0fecb953fdee60634c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Wed, 3 Jun 2015 20:43:43 -0700 Subject: switchdev: documentation: use switchdev_port_obj_xxx for IPv4 FIB add/modify/delete ops Clarify in documentation and code that IPV4 FIB add operation is used for both adding a new FIB entry to the device and for modifying an existing FIB entry on the device. Also, remove left-over references to ipv4_fib ops and replace with details on SWITCHDEV_PORT_IPV4_FIB object. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index ac853acbe211..e008057dab46 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -803,7 +803,7 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) } /** - * switchdev_fib_ipv4_add - Add IPv4 route entry to switch + * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -813,7 +813,7 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) * @nlflags: netlink flags passed in (NLM_F_*) * @tb_id: route table ID * - * Add IPv4 route entry to switch device. + * Add/modify switch IPv4 route entry. */ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 nlflags, u32 tb_id) -- cgit From 12e25e1041d044d4204f2b7c54695e14e8ffb282 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Jun 2015 23:49:21 -0700 Subject: tcp: remove redundant checks tcp_v4_rcv() checks the following before calling tcp_v4_do_rcv(): if (th->doff < sizeof(struct tcphdr) / 4) goto bad_packet; if (!pskb_may_pull(skb, th->doff * 4)) goto discard_it; So following check in tcp_v4_do_rcv() is redundant and "goto csum_err;" is wrong anyway. if (skb->len < tcp_hdrlen(skb) || ...) goto csum_err; A second check can be removed after no_tcp_socket label for same reason. Same tests can be removed in tcp_v6_do_rcv() Note : short tcp frames are not properly accounted in tcpInErrs MIB, because pskb_may_pull() failure simply drops incoming skb, we might fix this in a separate patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index feb875769b8d..3130c42240c8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1400,7 +1400,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1647,7 +1647,7 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; - if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7be3d858cbf0..b72e1ffc6bb4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1250,7 +1250,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1442,7 +1442,7 @@ no_tcp_socket: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: -- cgit From c39c4c6abb89d24454b63798ccbae12b538206a5 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Jun 2015 11:10:42 +0100 Subject: tcp: double default TSQ output bytes limit Xen virtual network driver has higher latency than a physical NIC. Having only 128K as limit for TSQ introduced 30% regression in guest throughput. This patch raises the limit to 256K. This reduces the regression to 8%. This buys us more time to work out a proper solution in the long run. Signed-off-by: Wei Liu Cc: David Miller Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 190538a2a88c..eeb59befaf06 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; -/* Default TSQ limit of two TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 131072; +/* Default TSQ limit of four TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 262144; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames -- cgit From 07bd77fa4c0f1bd21fad0f9fa12ad4e453d3fad8 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Wed, 3 Jun 2015 09:21:47 +0530 Subject: cfg802154: fix rdev-ops naming convension and format specifiers This patch make to use the same naming convention that mac802154 tracing follows and fixes the format specifier for extended addr. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/trace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 73eb7605c1eb..9b5f0eb36696 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -56,7 +56,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf, __entry->type = type; __entry->extended_addr = extended_addr; ), - TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx", + TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, extended addr: 0x%llx", WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type, __le64_to_cpu(__entry->extended_addr)) ); @@ -104,7 +104,7 @@ TRACE_EVENT(802154_rdev_set_tx_power, WPAN_PHY_ASSIGN; __entry->power = power; ), - TP_printk(WPAN_PHY_PR_FMT ", power: %d", WPAN_PHY_PR_ARG, + TP_printk(WPAN_PHY_PR_FMT ", mbm: %d", WPAN_PHY_PR_ARG, __entry->power) ); @@ -134,7 +134,7 @@ TRACE_EVENT(802154_rdev_set_cca_ed_level, WPAN_PHY_ASSIGN; __entry->ed_level = ed_level; ), - TP_printk(WPAN_PHY_PR_FMT ", ed_level: %d", WPAN_PHY_PR_ARG, + TP_printk(WPAN_PHY_PR_FMT ", ed level: %d", WPAN_PHY_PR_ARG, __entry->ed_level) ); @@ -167,7 +167,7 @@ DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 le16arg), TP_ARGS(wpan_phy, wpan_dev, le16arg), - TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x", + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", short addr: 0x%04x", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __le16_to_cpu(__entry->le16arg)) ); @@ -190,7 +190,7 @@ TRACE_EVENT(802154_rdev_set_backoff_exponent, ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT - ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG, + ", min be: %d, max be: %d", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be) ); -- cgit From 8a70cefa3037d62e7c0b6068a66675def1a330c9 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Wed, 3 Jun 2015 10:50:19 +0300 Subject: ieee802154: Fix sockaddr_ieee802154 implicit padding information leak. The AF_IEEE802154 sockaddr looks like this: struct sockaddr_ieee802154 { sa_family_t family; /* AF_IEEE802154 */ struct ieee802154_addr_sa addr; }; struct ieee802154_addr_sa { int addr_type; u16 pan_id; union { u8 hwaddr[IEEE802154_ADDR_LEN]; u16 short_addr; }; }; On most architectures there will be implicit structure padding here, in two different places: * In struct sockaddr_ieee802154, two bytes of padding between 'family' (unsigned short) and 'addr', so that 'addr' starts on a four byte boundary. * In struct ieee802154_addr_sa, two bytes at the end of the structure, to make the structure 16 bytes. When calling recvmsg(2) on a PF_IEEE802154 SOCK_DGRAM socket, the ieee802154 stack constructs a struct sockaddr_ieee802154 on the kernel stack without clearing these padding fields, and, depending on the addr_type, between four and ten bytes of uncleared kernel stack will be copied to userspace. We can't just insert two 'u16 __pad's in the right places and zero those before copying an address to userspace, as not all architectures insert this implicit padding -- from a quick test it seems that avr32, cris and m68k don't insert this padding, while every other architecture that I have cross compilers for does insert this padding. The easiest way to plug the leak is to just memset the whole struct sockaddr_ieee802154 before filling in the fields we want to fill in, and that's what this patch does. Cc: stable@vger.kernel.org Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/socket.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 02abef2c1621..b6eacf30ee7a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -731,6 +731,12 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, sock_recv_ts_and_drops(msg, sk, skb); if (saddr) { + /* Clear the implicit padding in struct sockaddr_ieee802154 + * (16 bits between 'family' and 'addr') and in struct + * ieee802154_addr_sa (16 bits at the end of the structure). + */ + memset(saddr, 0, sizeof(*saddr)); + saddr->family = AF_IEEE802154; ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); *addr_len = sizeof(*saddr); -- cgit From 133be0264f28e59d772c6a259349ba3ee2b183b3 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Thu, 4 Jun 2015 13:07:36 +0530 Subject: nl802154: export supported commands This patch will export the supported commands by the devices to the userspace. This will be useful to check if HardMAC drivers can support a specific command or not. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 7dbb1f4ce7df..68f24016860c 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -228,6 +228,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, }, [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED }, + + [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, }; /* message building helper */ @@ -372,7 +374,9 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, struct sk_buff *msg, u32 portid, u32 seq, int flags) { + struct nlattr *nl_cmds; void *hdr; + int i; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) @@ -431,6 +435,42 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, if (nl802154_put_capabilities(msg, rdev)) goto nla_put_failure; + nl_cmds = nla_nest_start(msg, NL802154_ATTR_SUPPORTED_COMMANDS); + if (!nl_cmds) + goto nla_put_failure; + + i = 0; +#define CMD(op, n) \ + do { \ + if (rdev->ops->op) { \ + i++; \ + if (nla_put_u32(msg, i, NL802154_CMD_ ## n)) \ + goto nla_put_failure; \ + } \ + } while (0) + + CMD(add_virtual_intf, NEW_INTERFACE); + CMD(del_virtual_intf, DEL_INTERFACE); + CMD(set_channel, SET_CHANNEL); + CMD(set_pan_id, SET_PAN_ID); + CMD(set_short_addr, SET_SHORT_ADDR); + CMD(set_backoff_exponent, SET_BACKOFF_EXPONENT); + CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS); + CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES); + CMD(set_lbt_mode, SET_LBT_MODE); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) + CMD(set_tx_power, SET_TX_POWER); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) + CMD(set_cca_ed_level, SET_CCA_ED_LEVEL); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) + CMD(set_cca_mode, SET_CCA_MODE); + +#undef CMD + nla_nest_end(msg, nl_cmds); + finish: genlmsg_end(msg, hdr); return 0; -- cgit From 94db13fe5ff144ea5afcfa3a3daf38a239090acf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 08:33:48 -0700 Subject: bpf: fix build due to missing tc_verd fix build error: net/core/filter.c: In function 'bpf_clone_redirect': net/core/filter.c:1429:18: error: 'struct sk_buff' has no member named 'tc_verd' if (G_TC_AT(skb2->tc_verd) & AT_INGRESS) Fixes: 3896d655f4d4 ("bpf: introduce bpf_clone_redirect() helper") Reported-by: Or Gerlitz Reported-by: Fengguang Wu Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 64c121c09655..09b2062eb5b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -46,7 +46,6 @@ #include #include #include -#include /** * sk_filter - run a packet through a socket filter @@ -1426,8 +1425,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - if (G_TC_AT(skb2->tc_verd) & AT_INGRESS) - skb_push(skb2, skb2->mac_len); + skb_push(skb2, skb2->data - skb_mac_header(skb2)); if (BPF_IS_REDIRECT_INGRESS(flags)) return dev_forward_skb(dev, skb2); -- cgit From ce3b5355477ce99bffa60a6a215f2e11db4b649c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:36 -0700 Subject: net: Simplify GRE case in flow_dissector Do break when we see routing flag or a non-zero version number in GRE header. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1f2d89300b1a..7f699169dc92 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -308,30 +308,30 @@ flow_label: * Only look inside GRE if version zero and no * routing */ - if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { - proto = hdr->proto; + if (hdr->flags & (GRE_VERSION | GRE_ROUTING)) + break; + + proto = hdr->proto; + nhoff += 4; + if (hdr->flags & GRE_CSUM) nhoff += 4; - if (hdr->flags & GRE_CSUM) - nhoff += 4; - if (hdr->flags & GRE_KEY) - nhoff += 4; - if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) - return false; - proto = eth->h_proto; - nhoff += sizeof(*eth); - } - goto again; + if (hdr->flags & GRE_KEY) + nhoff += 4; + if (hdr->flags & GRE_SEQ) + nhoff += 4; + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + return false; + proto = eth->h_proto; + nhoff += sizeof(*eth); } - break; + goto again; } case IPPROTO_IPIP: proto = htons(ETH_P_IP); -- cgit From c468efe2c7d478bad8855f7d170cf245ee0f1b3f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:38 -0700 Subject: net: Remove superfluous setting of key_basic key_basic is set twice in __skb_flow_dissect which seems unnecessary. Remove second one. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7f699169dc92..0763795bea8f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -343,12 +343,6 @@ flow_label: break; } - /* It is ensured by skb_flow_dissector_init() that basic key will - * be always present. - */ - key_basic = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_BASIC, - target_container); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; key_basic->thoff = (u16) nhoff; -- cgit From 42aecaa9bb2bd57eb8d61b4565cee5d3640863fb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:39 -0700 Subject: net: Get skb hash over flow_keys structure This patch changes flow hashing to use jhash2 over the flow_keys structure instead just doing jhash_3words over src, dst, and ports. This method will allow us take more input into the hashing function so that we can include full IPv6 addresses, VLAN, flow labels etc. without needing to resort to xor'ing which makes for a poor hash. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 54 +++++++++++++++++++++++++++++++++++------------ net/sched/cls_flower.c | 2 ++ 2 files changed, 43 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0763795bea8f..55b5f2962afa 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -57,9 +57,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, flow_dissector->offset[key->key_id] = key->offset; } - /* Ensure that the dissector always includes basic key. That way - * we are able to avoid handling lack of it in fast path. + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. */ + BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } @@ -120,6 +122,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, void *target_container, void *data, __be16 proto, int nhoff, int hlen) { + struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; @@ -132,6 +135,13 @@ bool __skb_flow_dissect(const struct sk_buff *skb, hlen = skb_headlen(skb); } + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. */ @@ -219,7 +229,7 @@ flow_label: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_basic->thoff = (u16)nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { @@ -275,7 +285,7 @@ flow_label: if (!hdr) return false; key_basic->n_proto = proto; - key_basic->thoff = (u16)nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { @@ -288,7 +298,7 @@ flow_label: return true; } case htons(ETH_P_FCOE): - key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: return false; @@ -345,7 +355,7 @@ flow_label: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_basic->thoff = (u16) nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { @@ -366,9 +376,21 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval) +static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) +{ + return jhash2(words, length, keyval); +} + +static inline void *flow_keys_hash_start(struct flow_keys *flow) { - return jhash_3words(a, b, c, keyval); + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); + return (void *)flow + FLOW_KEYS_HASH_OFFSET; +} + +static inline size_t flow_keys_hash_length(struct flow_keys *flow) +{ + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32); } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) @@ -383,10 +405,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) swap(keys->ports.src, keys->ports.dst); } - hash = __flow_hash_3words((__force u32)keys->addrs.dst, - (__force u32)keys->addrs.src, - (__force u32)keys->ports.ports, - keyval); + hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -473,7 +493,7 @@ EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = keys->basic.thoff; + u32 poff = keys->control.thoff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { @@ -536,6 +556,10 @@ u32 skb_get_poff(const struct sk_buff *skb) } static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), @@ -555,6 +579,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8c8f34ef6980..5a7d66c59684 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -25,6 +25,7 @@ struct fl_flow_key { int indev_ifindex; + struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; union { @@ -347,6 +348,7 @@ static void fl_init_dissector(struct cls_fl_head *head, struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); -- cgit From c3f8324188fa80178f20c8209b492ca6191177e8 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:40 -0700 Subject: net: Add full IPv6 addresses to flow_keys This patch adds full IPv6 addresses into flow_keys and uses them as input to the flow hash function. The implementation supports either IPv4 or IPv6 addresses in a union, and selector is used to determine how may words to input to jhash2. We also add flow_get_u32_dst and flow_get_u32_src functions which are used to get a u32 representation of the source and destination addresses. For IPv6, ipv6_addr_hash is called. These functions retain getting the legacy values of src and dst in flow_keys. With this patch, Ethertype and IP protocol are now included in the flow hash input. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 116 ++++++++++++++++++++++++++++++++++++++-------- net/ethernet/eth.c | 2 +- net/sched/cls_flow.c | 14 ++++-- net/sched/cls_flower.c | 11 +++-- 4 files changed, 114 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 55b5f2962afa..ca9d22488cfb 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -178,10 +178,12 @@ ip: if (!skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; + key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, - target_container); - memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs)); + FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; break; } case htons(ETH_P_IPV6): { @@ -203,8 +205,11 @@ ipv6: FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); - key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->v4addrs.src = + (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->v4addrs.dst = + (__force __be32)ipv6_addr_hash(&iph->daddr); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; goto flow_label; } if (skb_flow_dissector_uses_key(flow_dissector, @@ -216,6 +221,7 @@ ipv6: target_container); memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; goto flow_label; } break; @@ -292,8 +298,9 @@ flow_label: key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); - key_addrs->src = hdr->srcnode; - key_addrs->dst = 0; + key_addrs->v4addrs.src = hdr->srcnode; + key_addrs->v4addrs.dst = 0; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } return true; } @@ -389,21 +396,88 @@ static inline void *flow_keys_hash_start(struct flow_keys *flow) static inline size_t flow_keys_hash_length(struct flow_keys *flow) { + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); - return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32); + BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != + sizeof(*flow) - sizeof(flow->addrs)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; + } + return (sizeof(*flow) - diff) / sizeof(u32); +} + +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); + +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_dst); + +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + addr_diff = (__force u32)keys->addrs.v4addrs.dst - + (__force u32)keys->addrs.v4addrs.src; + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + addr_diff = memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + sizeof(keys->addrs.v6addrs.dst)); + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + swap(keys->ports.src, keys->ports.dst); + } + break; + } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) { u32 hash; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || - (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && - ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) { - swap(keys->addrs.dst, keys->addrs.src); - swap(keys->ports.src, keys->ports.dst); - } + __flow_hash_consistentify(keys); hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); @@ -451,8 +525,8 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; - data->src = flow->addrs.src; - data->dst = flow->addrs.dst; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); @@ -566,11 +640,15 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, - .offset = offsetof(struct flow_keys, addrs), + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - .offset = offsetof(struct flow_keys, addrs), + .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 7d0e239a6755..77e0f0e7a88e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -133,7 +133,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.basic.thoff, sizeof(*eth)); + return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index b4359924846c..76bc3a20ffdb 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -68,15 +68,21 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->addrs.src) - return ntohl(flow->addrs.src); + __be32 src = flow_get_u32_src(flow); + + if (src) + return ntohl(src); + return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->addrs.dst) - return ntohl(flow->addrs.dst); + __be32 dst = flow_get_u32_dst(flow); + + if (dst) + return ntohl(dst); + return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5a7d66c59684..b92d3f49c23e 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,8 +28,9 @@ struct fl_flow_key { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_addrs ipaddrs; union { - struct flow_dissector_key_addrs ipv4; + struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; @@ -260,14 +261,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); } - if (key->basic.n_proto == htons(ETH_P_IP)) { + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)); fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)); - } else if (key->basic.n_proto == htons(ETH_P_IPV6)) { + } else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)); @@ -610,7 +611,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->basic.ip_proto))) goto nla_put_failure; - if (key->basic.n_proto == htons(ETH_P_IP) && + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)) || @@ -618,7 +619,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)))) goto nla_put_failure; - else if (key->basic.n_proto == htons(ETH_P_IPV6) && + else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)) || -- cgit From 9f24908901c5f4b1e3b07548106b1790af933476 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:41 -0700 Subject: net: Add keys for TIPC address Add a new flow key for TIPC addresses. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ca9d22488cfb..91861c3d45a7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -294,13 +294,12 @@ flow_label: key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + FLOW_DISSECTOR_KEY_TIPC_ADDRS, target_container); - key_addrs->v4addrs.src = hdr->srcnode; - key_addrs->v4addrs.dst = 0; - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + key_addrs->tipcaddrs.srcnode = hdr->srcnode; + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; } return true; } @@ -408,6 +407,9 @@ static inline size_t flow_keys_hash_length(struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + diff -= sizeof(flow->addrs.tipcaddrs); + break; } return (sizeof(*flow) - diff) / sizeof(u32); } @@ -420,6 +422,8 @@ __be32 flow_get_u32_src(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + return flow->addrs.tipcaddrs.srcnode; default: return 0; } @@ -650,6 +654,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, + .offset = offsetof(struct flow_keys, addrs.tipcaddrs), + }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), -- cgit From 45b47fd00ca14df869979dbbe14324625ec93552 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:42 -0700 Subject: net: Get rid of IPv6 hash addresses flow keys We don't need to return the IPv6 address hash as part of flow keys. In general, using the IPv6 address hash is risky in a hash value since the underlying use of xor provides no entropy. If someone really needs the hash value they can get it from the full IPv6 addresses in flow keys (e.g. from flow_get_u32_src). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 91861c3d45a7..5348a468de78 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -199,19 +199,6 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - target_container); - - key_addrs->v4addrs.src = - (__force __be32)ipv6_addr_hash(&iph->saddr); - key_addrs->v4addrs.dst = - (__force __be32)ipv6_addr_hash(&iph->daddr); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - goto flow_label; - } if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; @@ -650,10 +637,6 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, - { - .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - .offset = offsetof(struct flow_keys, addrs.v4addrs), - }, { .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, .offset = offsetof(struct flow_keys, addrs.tipcaddrs), -- cgit From d34af823ff401c312541aa613c49ea4b872bde9e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:43 -0700 Subject: net: Add VLAN ID to flow_keys In flow_dissector set vlan_id in flow_keys when VLAN is found. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5348a468de78..5c66cb2e66df 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -126,6 +126,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; u8 ip_proto; if (!data) { @@ -246,6 +247,15 @@ flow_label: if (!vlan) return false; + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID, + target_container); + + key_tags->vlan_id = skb_vlan_tag_get_id(skb); + } + proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); goto again; @@ -645,6 +655,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, + { + .key_id = FLOW_DISSECTOR_KEY_VLANID, + .offset = offsetof(struct flow_keys, tags), + }, }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { -- cgit From 87ee9e52ffeb168803a76cc07734425227cc2268 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:44 -0700 Subject: net: Add IPv6 flow label to flow_keys In flow_dissector set the flow label in flow_keys for IPv6. This also removes the shortcircuiting of flow dissection when a non-zero label is present, the flow label can be considered to provide additional entropy for a hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5c66cb2e66df..70f0567b6be9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -210,30 +210,17 @@ ipv6: memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - goto flow_label; } - break; -flow_label: + flow_label = ip6_flowlabel(iph); if (flow_label) { - /* Awesome, IPv6 packet has a flow label so we can - * use that to represent the ports without any - * further dissection. - */ - - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS, - target_container); - key_ports->ports = flow_label; + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_label); } - - return true; } break; @@ -659,6 +646,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_VLANID, .offset = offsetof(struct flow_keys, tags), }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { -- cgit From 1fdd512c92003cf2d671ba22753d13302bf8cd1d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:45 -0700 Subject: net: Add GRE keyid in flow_keys In flow dissector if a GRE header contains a keyid this is saved in the new keyid field of flow_keys. The GRE keyid is then represented in the flow hash function input. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 70f0567b6be9..7ddc0a7b3da5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -127,6 +127,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_keyid *key_keyid; u8 ip_proto; if (!data) { @@ -315,8 +316,25 @@ ipv6: nhoff += 4; if (hdr->flags & GRE_CSUM) nhoff += 4; - if (hdr->flags & GRE_KEY) + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), + data, hlen, &_keyid); + + if (!keyid) + return false; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + key_keyid->keyid = *keyid; + } nhoff += 4; + } if (hdr->flags & GRE_SEQ) nhoff += 4; if (proto == htons(ETH_P_TEB)) { @@ -650,6 +668,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { -- cgit From b3baa0fbd02a1a9d493d8cb92ae4a4491b9e9d13 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:46 -0700 Subject: mpls: Add MPLS entropy label in flow_keys In flow dissector if an MPLS header contains an entropy label this is saved in the new keyid field of flow_keys. The entropy label is then represented in the flow hash function input. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7ddc0a7b3da5..77e22e4fc898 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -288,6 +289,37 @@ ipv6: } return true; } + + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): { + struct mpls_label *hdr, _hdr[2]; +mpls: + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return false; + + if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) == + MPLS_LABEL_ENTROPY) { + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = hdr[1].entry & + htonl(MPLS_LS_LABEL_MASK); + } + + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; + + return true; + } + + return true; + } + case htons(ETH_P_FCOE): key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ @@ -357,6 +389,9 @@ ipv6: case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); goto ipv6; + case IPPROTO_MPLS: + proto = htons(ETH_P_MPLS_UC); + goto mpls; default: break; } -- cgit From 951b6a0717db97ce420547222647bcc40bf1eacd Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 14 May 2015 12:58:08 +0530 Subject: Bluetooth: Fix potential NULL dereference in RFCOMM bind callback addr can be NULL and it should not be dereferenced before NULL checking. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- net/bluetooth/rfcomm/sock.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b2338e971b33..7511df72347f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -334,16 +334,19 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { - struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sockaddr_rc sa; struct sock *sk = sock->sk; - int chan = sa->rc_channel; - int err = 0; - - BT_DBG("sk %p %pMR", sk, &sa->rc_bdaddr); + int len, err = 0; if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + memset(&sa, 0, sizeof(sa)); + len = min_t(unsigned int, sizeof(sa), addr_len); + memcpy(&sa, addr, len); + + BT_DBG("sk %p %pMR", sk, &sa.rc_bdaddr); + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -358,12 +361,13 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr write_lock(&rfcomm_sk_list.lock); - if (chan && __rfcomm_get_listen_sock_by_addr(chan, &sa->rc_bdaddr)) { + if (sa.rc_channel && + __rfcomm_get_listen_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ - bacpy(&rfcomm_pi(sk)->src, &sa->rc_bdaddr); - rfcomm_pi(sk)->channel = chan; + bacpy(&rfcomm_pi(sk)->src, &sa.rc_bdaddr); + rfcomm_pi(sk)->channel = sa.rc_channel; sk->sk_state = BT_BOUND; } -- cgit From 9380f9eacfbbee701daa416edd6625efcd3e29e1 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 21 May 2015 16:46:41 +0200 Subject: Bluetooth: Reorder HCI user channel socket release The hci close method needs to know if we are in user channel context. Only add the index to mgmt once close is performed. Signed-off-by: Loic Poulain Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 5b14dcafcd08..9467545e5c97 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -503,9 +503,9 @@ static int hci_sock_release(struct socket *sock) if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { - mgmt_index_added(hdev); - hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); hci_dev_close(hdev->id); + hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); + mgmt_index_added(hdev); } atomic_dec(&hdev->promisc); -- cgit From 90c337da1524863838658078ec34241f45d8394d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Jun 2015 21:17:57 -0700 Subject: inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations When an application needs to force a source IP on an active TCP socket it has to use bind(IP, port=x). As most applications do not want to deal with already used ports, x is often set to 0, meaning the kernel is in charge to find an available port. But kernel does not know yet if this socket is going to be a listener or be connected. It has very limited choices (no full knowledge of final 4-tuple for a connect()) With limited ephemeral port range (about 32K ports), it is very easy to fill the space. This patch adds a new SOL_IP socket option, asking kernel to ignore the 0 port provided by application in bind(IP, port=0) and only remember the given IP address. The port will be automatically chosen at connect() time, in a way that allows sharing a source port as long as the 4-tuples are unique. This new feature is available for both IPv4 and IPv6 (Thanks Neal) Tested: Wrote a test program and checked its behavior on IPv4 and IPv6. strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by connect(). Also getsockname() show that the port is still 0 right after bind() but properly allocated after connect(). socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5 setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 IPv6 test : socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7 setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 I was able to bind()/connect() a million concurrent IPv4 sockets, instead of ~32000 before patch. lpaa23:~# ulimit -n 1000010 lpaa23:~# ./bind --connect --num-flows=1000000 & 1000000 sockets lpaa23:~# grep TCP /proc/net/sockstat TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66 Check that a given source port is indeed used by many different connections : lpaa23:~# ss -t src :40000 | head -10 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 0 0 127.0.0.2:40000 127.0.202.33:44983 ESTAB 0 0 127.0.0.2:40000 127.2.27.240:44983 ESTAB 0 0 127.0.0.2:40000 127.2.98.5:44983 ESTAB 0 0 127.0.0.2:40000 127.0.124.196:44983 ESTAB 0 0 127.0.0.2:40000 127.2.139.38:44983 ESTAB 0 0 127.0.0.2:40000 127.1.59.80:44983 ESTAB 0 0 127.0.0.2:40000 127.3.6.228:44983 ESTAB 0 0 127.0.0.2:40000 127.0.38.53:44983 ESTAB 0 0 127.0.0.2:40000 127.1.197.10:44983 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 3 ++- net/ipv4/ip_sockglue.c | 7 +++++++ net/ipv6/af_inet6.c | 3 ++- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6ad0f7a711c9..cc858ef44451 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -488,7 +488,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7cfb0893f263..04ae2992a5cd 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -582,6 +582,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: + case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: @@ -732,6 +733,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, } inet->nodefrag = val ? 1 : 0; break; + case IP_BIND_ADDRESS_NO_PORT: + inet->bind_address_no_port = val ? 1 : 0; + break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; @@ -1324,6 +1328,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_NODEFRAG: val = inet->nodefrag; break; + case IP_BIND_ADDRESS_NO_PORT: + val = inet->bind_address_no_port; + break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f3866c0b6cfe..7de52b65173f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -362,7 +362,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) np->saddr = addr->sin6_addr; /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet_reset_saddr(sk); err = -EADDRINUSE; goto out; -- cgit From ed65963ba0a2bdc330b1d7183f930d1c6a0a6685 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 6 Jun 2015 17:30:46 +0200 Subject: mac802154: remove unneeded vif struct This patch removes the virtual interface structure from sub if data struct, because it isn't used anywhere. This structure could be useful for give per interface information at softmac driver layer. Nevertheless there exist no use case currently and it contains the interface type information currently. This information is also stored inside wpan dev which is now used to check on the wpan dev interface type. Signed-off-by: Alexander Aring Reviewed-by: Varka Bhadram Acked-by: Varka Bhadram Signed-off-by: Marcel Holtmann --- net/mac802154/ieee802154_i.h | 2 -- net/mac802154/iface.c | 9 ++++----- net/mac802154/rx.c | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index eec668f3637f..34755d5751a4 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -92,8 +92,6 @@ struct ieee802154_sub_if_data { struct mutex sec_mtx; struct mac802154_llsec sec; - /* must be last, dynamically sized area in this! */ - struct ieee802154_vif vif; }; #define MAC802154_CHAN_NONE 0xff /* No channel is assigned */ diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 6ac023932ce0..3a67d35d4672 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -219,8 +219,8 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, * exist really an use case if we need to support * multiple node types at the same time. */ - if (sdata->vif.type == NL802154_IFTYPE_NODE && - nsdata->vif.type == NL802154_IFTYPE_NODE) + if (wpan_dev->iftype == NL802154_IFTYPE_NODE && + nsdata->wpan_dev.iftype == NL802154_IFTYPE_NODE) return -EBUSY; /* check all phy mac sublayer settings are the same. @@ -243,7 +243,7 @@ static int mac802154_wpan_open(struct net_device *dev) struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; - rc = ieee802154_check_concurrent_iface(sdata, sdata->vif.type); + rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; @@ -467,7 +467,6 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, u8 tmp; /* set some type-dependent values */ - sdata->vif.type = type; sdata->wpan_dev.iftype = type; get_random_bytes(&tmp, sizeof(tmp)); @@ -523,7 +522,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, ASSERT_RTNL(); - ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name, + ndev = alloc_netdev(sizeof(*sdata), name, name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index e0f10063cac3..1bdf98068608 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -202,7 +202,7 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, } list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL802154_IFTYPE_NODE || + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE || !netif_running(sdata->dev)) continue; @@ -227,7 +227,7 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb) skb->protocol = htons(ETH_P_IEEE802154); list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL802154_IFTYPE_MONITOR) + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) continue; if (!ieee802154_sdata_running(sdata)) -- cgit From 98da81a426a76c351f3c2854d5bc31f17bba3194 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Jun 2015 08:01:12 -0700 Subject: tcp: remove redundant checks II For same reasons than in commit 12e25e1041d0 ("tcp: remove redundant checks"), we can remove redundant checks done for timewait sockets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ---- net/ipv6/tcp_ipv6.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3130c42240c8..d7d4c2b79cf2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1671,10 +1671,6 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff << 2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b72e1ffc6bb4..45a7176ed460 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1467,10 +1467,6 @@ do_time_wait: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; -- cgit From 3431205e03977aaf32bce6d4b16fb8244b510056 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:53 -0700 Subject: bpf: make programs see skb->data == L2 for ingress and egress eBPF programs attached to ingress and egress qdiscs see inconsistent skb->data. For ingress L2 header is already pulled, whereas for egress it's present. This is known to program writers which are currently forced to use BPF_LL_OFF workaround. Since programs don't change skb internal pointers it is safe to do pull/push right around invocation of the program and earlier taps and later pt->func() will not be affected. Multiple taps via packet_rcv(), tpacket_rcv() are doing the same trick around run_filter/BPF_PROG_RUN even if skb_shared. This fix finally allows programs to use optimized LD_ABS/IND instructions without BPF_LL_OFF for higher performance. tc ingress + cls_bpf + samples/bpf/tcbpf1_kern.o w/o JIT w/JIT before 20.5 23.6 Mpps after 21.8 26.6 Mpps Old programs with BPF_LL_OFF will still work as-is. We can now undo most of the earlier workaround commit: a166151cbe33 ("bpf: fix bpf helpers to use skb->mac_header relative offsets") Signed-off-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/filter.c | 26 +++----------------------- net/sched/act_bpf.c | 9 ++++++++- net/sched/cls_bpf.c | 16 +++++++++++++++- 3 files changed, 26 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 09b2062eb5b8..36a69e33d76b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1238,21 +1238,6 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -/** - * bpf_skb_clone_not_writable - is the header of a clone not writable - * @skb: buffer to check - * @len: length up to which to write, can be negative - * - * Returns true if modifying the header part of the cloned buffer - * does require the data to be copied. I.e. this version works with - * negative lengths needed for eBPF case! - */ -static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len) -{ - return skb_header_cloned(skb) || - (int) skb_headroom(skb) + len > skb->hdr_len; -} - #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) @@ -1275,9 +1260,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + len))) + !skb_clone_writable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, buf); @@ -1321,9 +1305,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1369,9 +1352,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1425,8 +1407,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - skb_push(skb2, skb2->data - skb_mac_header(skb2)); - if (BPF_IS_REDIRECT_INGRESS(flags)) return dev_forward_skb(dev, skb2); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index dc6a2d324bd8..1d56903fd4c7 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -37,6 +37,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, { struct tcf_bpf *prog = act->priv; int action, filter_res; + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; if (unlikely(!skb_mac_header_was_set(skb))) return TC_ACT_UNSPEC; @@ -48,7 +49,13 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, /* Needed here for accessing maps. */ rcu_read_lock(); - filter_res = BPF_PROG_RUN(prog->filter, skb); + if (at_ingress) { + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 91bd9c19471d..c79ecfd36e0f 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -64,6 +64,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; +#ifdef CONFIG_NET_CLS_ACT + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; +#else + bool at_ingress = false; +#endif int ret = -1; if (unlikely(!skb_mac_header_was_set(skb))) @@ -72,7 +77,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* Needed here for accessing maps. */ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { - int filter_res = BPF_PROG_RUN(prog->filter, skb); + int filter_res; + + if (at_ingress) { + /* It is safe to push/pull even if skb_shared() */ + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } if (filter_res == 0) continue; -- cgit From d691f9e8d4405c334aa10d556e73c8bf44cb0e01 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:54 -0700 Subject: bpf: allow programs to write to certain skb fields allow programs read/write skb->mark, tc_index fields and ((struct qdisc_skb_cb *)cb)->data. mark and tc_index are generically useful in TC. cb[0]-cb[4] are primarily used to pass arguments from one program to another called via bpf_tail_call() which can be seen in sockex3_kern.c example. All fields of 'struct __sk_buff' are readable to socket and tc_cls_act progs. mark, tc_index are writeable from tc_cls_act only. cb[0]-cb[4] are writeable by both sockets and tc_cls_act. Add verifier tests and improve sample code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 36a69e33d76b..d271c06bf01f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -46,6 +46,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1463,13 +1464,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } } -static bool sk_filter_is_valid_access(int off, int size, - enum bpf_access_type type) +static bool __is_valid_access(int off, int size, enum bpf_access_type type) { - /* only read is allowed */ - if (type != BPF_READ) - return false; - /* check bounds */ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -1485,8 +1481,42 @@ static bool sk_filter_is_valid_access(int off, int size, return true; } -static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, - struct bpf_insn *insn_buf) +static bool sk_filter_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + return __is_valid_access(off, size, type); +} + +static bool tc_cls_act_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, tc_index): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + return __is_valid_access(off, size, type); +} + +static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -1538,7 +1568,15 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, break; case offsetof(struct __sk_buff, mark): - return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; case offsetof(struct __sk_buff, pkt_type): return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); @@ -1553,6 +1591,38 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, case offsetof(struct __sk_buff, vlan_tci): return convert_skb_access(SKF_AD_VLAN_TAG, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + + ctx_off -= offsetof(struct __sk_buff, cb[0]); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct qdisc_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + break; + + case offsetof(struct __sk_buff, tc_index): +#ifdef CONFIG_NET_SCHED + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + else + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + break; +#else + if (type == BPF_WRITE) + *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + else + *insn++ = BPF_MOV64_IMM(dst_reg, 0); + break; +#endif } return insn - insn_buf; @@ -1561,13 +1631,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, - .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { -- cgit From 36fd61cb80fcf07c20230face1a0f6e1505c8322 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 1 Mar 2015 09:46:18 +0100 Subject: batman-adv: Use common Jenkins Hash implementation An unoptimized version of the Jenkins one-at-a-time hash function is used and partially copied all over the code wherever an hashtable is used. Instead the optimized version shared between the whole kernel should be used to reduce code duplication and use better optimized code. Only the DAT code must use the old implementation because it is used as distributed hash function which has to be common for all nodes. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/bridge_loop_avoidance.c | 16 ++++------------ net/batman-adv/distributed-arp-table.c | 17 +++++++++++++++-- net/batman-adv/hash.h | 22 ---------------------- net/batman-adv/main.h | 1 + net/batman-adv/network-coding.c | 10 ++-------- net/batman-adv/originator.h | 13 +------------ net/batman-adv/translation-table.c | 8 ++------ 7 files changed, 25 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index fa941cd7d8ad..f2ac903e091e 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -42,12 +42,8 @@ static inline uint32_t batadv_choose_claim(const void *data, uint32_t size) struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; uint32_t hash = 0; - hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); - hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&claim->addr, sizeof(claim->addr), hash); + hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } @@ -59,12 +55,8 @@ static inline uint32_t batadv_choose_backbone_gw(const void *data, const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; uint32_t hash = 0; - hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); - hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&claim->addr, sizeof(claim->addr), hash); + hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index da1742d9059f..2e6198ecc07d 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -206,9 +206,22 @@ static uint32_t batadv_hash_dat(const void *data, uint32_t size) { uint32_t hash = 0; const struct batadv_dat_entry *dat = data; + const unsigned char *key; + uint32_t i; + + key = (const unsigned char *)&dat->ip; + for (i = 0; i < sizeof(dat->ip); i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } - hash = batadv_hash_bytes(hash, &dat->ip, sizeof(dat->ip)); - hash = batadv_hash_bytes(hash, &dat->vid, sizeof(dat->vid)); + key = (const unsigned char *)&dat->vid; + for (i = 0; i < sizeof(dat->vid); i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } hash += (hash << 3); hash ^= (hash >> 11); diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 379e32acf2b4..bcf427cafe49 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -79,28 +79,6 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, batadv_hash_destroy(hash); } -/** - * batadv_hash_bytes - hash some bytes and add them to the previous hash - * @hash: previous hash value - * @data: data to be hashed - * @size: number of bytes to be hashed - * - * Returns the new hash value. - */ -static inline uint32_t batadv_hash_bytes(uint32_t hash, const void *data, - uint32_t size) -{ - const unsigned char *key = data; - int i; - - for (i = 0; i < size; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - return hash; -} - /** * batadv_hash_add - adds data to the hashtable * @hash: storage hash table diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index af0a3361d4b2..985d017920d5 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -174,6 +174,7 @@ enum batadv_uev_type { #include /* workqueue */ #include #include +#include #include /* struct sock */ #include /* ipv6 address stuff */ #include diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b984bc49deaf..fb54319abff3 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -453,14 +453,8 @@ static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size) const struct batadv_nc_path *nc_path = data; uint32_t hash = 0; - hash = batadv_hash_bytes(hash, &nc_path->prev_hop, - sizeof(nc_path->prev_hop)); - hash = batadv_hash_bytes(hash, &nc_path->next_hop, - sizeof(nc_path->next_hop)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); + hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); return hash % size; } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 91339143a2f7..0b716fe1db99 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -75,20 +75,9 @@ void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan); */ static inline uint32_t batadv_choose_orig(const void *data, uint32_t size) { - const unsigned char *key = data; uint32_t hash = 0; - size_t i; - - for (i = 0; i < 6; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(data, ETH_ALEN, hash); return hash % size; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b098e53edded..fe9f8244ac0a 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -67,12 +67,8 @@ static inline uint32_t batadv_choose_tt(const void *data, uint32_t size) uint32_t hash = 0; tt = (struct batadv_tt_common_entry *)data; - hash = batadv_hash_bytes(hash, &tt->addr, ETH_ALEN); - hash = batadv_hash_bytes(hash, &tt->vid, sizeof(tt->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&tt->addr, ETH_ALEN, hash); + hash = jhash(&tt->vid, sizeof(tt->vid), hash); return hash % size; } -- cgit From 6f70eb7552abc73584f43b701d71c1c377887af0 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sun, 1 Mar 2015 00:50:16 +0800 Subject: batman-adv: split name from variable for uint mesh attributes Some mesh attributes are behind substructs in the batadv_priv object and for this reason the name cannot be used anymore to refer to them. This patch allows to specify the variable name where the attribute is stored inside batadv_priv instead of using the name Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/sysfs.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fa8c347bf057..7278b2bd5cd0 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -151,7 +151,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ batadv_store_##_name) -#define BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func) \ +#define BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func) \ ssize_t batadv_store_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff, \ size_t count) \ @@ -161,24 +161,24 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ return __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &bat_priv->_name, net_dev); \ + &bat_priv->_var, net_dev); \ } -#define BATADV_ATTR_SIF_SHOW_UINT(_name) \ +#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ ssize_t batadv_show_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff) \ { \ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ \ - return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name)); \ + return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \ } \ /* Use this, if you are going to set [name] in the soft-interface * (bat_priv) to an unsigned integer value */ -#define BATADV_ATTR_SIF_UINT(_name, _mode, _min, _max, _post_func) \ - static BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func)\ - static BATADV_ATTR_SIF_SHOW_UINT(_name) \ +#define BATADV_ATTR_SIF_UINT(_name, _var, _mode, _min, _max, _post_func)\ + static BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func)\ + static BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ batadv_store_##_name) @@ -540,19 +540,20 @@ BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu); static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL); static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode, batadv_store_gw_mode); -BATADV_ATTR_SIF_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, - INT_MAX, NULL); -BATADV_ATTR_SIF_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, - NULL); -BATADV_ATTR_SIF_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE, - batadv_post_gw_reselect); +BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, + 2 * BATADV_JITTER, INT_MAX, NULL); +BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, + BATADV_TQ_MAX_VALUE, NULL); +BATADV_ATTR_SIF_UINT(gw_sel_class, gw_sel_class, S_IRUGO | S_IWUSR, 1, + BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); #ifdef CONFIG_BATMAN_ADV_MCAST BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL); #endif #ifdef CONFIG_BATMAN_ADV_DEBUG -BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL); +BATADV_ATTR_SIF_UINT(log_level, log_level, S_IRUGO | S_IWUSR, 0, + BATADV_DBG_ALL, NULL); #endif #ifdef CONFIG_BATMAN_ADV_NC BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR, -- cgit From bcef1f3c49daee2829cf58e30310427c63038cff Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sun, 1 Mar 2015 00:50:17 +0800 Subject: batman-adv: add bat_neigh_free API This API has to be used to let any routing protocol free neighbor specific allocated resources Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/originator.c | 6 ++++++ net/batman-adv/types.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index e3900e452616..aa092c34b88b 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -197,13 +197,19 @@ static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; + struct batadv_algo_ops *bao; neigh_node = container_of(rcu, struct batadv_neigh_node, rcu); + bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo); } + + if (bao->bat_neigh_free) + bao->bat_neigh_free(neigh_node); + batadv_hardif_free_ref_now(neigh_node->if_incoming); kfree(neigh_node); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index c1000c0d6b0d..336f94bba558 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1121,6 +1121,8 @@ struct batadv_forw_packet { * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better * than neigh2 for their respective outgoing interface from the metric * prospective + * @bat_neigh_free: free the resources allocated by the routing algorithm for a + * neigh_node object * @bat_orig_print: print the originator table (optional) * @bat_orig_free: free the resources allocated by the routing algorithm for an * orig_node object @@ -1138,6 +1140,7 @@ struct batadv_algo_ops { void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet); + /* neigh_node handling API */ int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, @@ -1147,6 +1150,7 @@ struct batadv_algo_ops { struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); + void (*bat_neigh_free)(struct batadv_neigh_node *neigh); /* orig_node handling API */ void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); -- cgit From 1e2c2a4fe4a52cc55a78727778119f9a74283b8a Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 17 Apr 2015 19:40:28 +0200 Subject: batman-adv: Add required includes to all files The header files could not be build indepdent from each other. This is happened because headers didn't include the files for things they've used. This was problematic because the success of a build depended on the knowledge about the right order of local includes. Also source files were not including everything they've used explicitly. Instead they required that transitive includes are always stable. This is problematic because some transitive includes are not obvious, depend on config settings and may not be stable in the future. The order for include blocks are: * primary headers (main.h and the *.h file of a *.c file) * global linux headers * required local headers * extra forward declarations for pointers in function/struct declarations The only exceptions are linux/bitops.h and linux/if_ether.h in packet.h. This header file is shared with userspace applications like batctl and must therefore build together with userspace applications. The header linux/bitops.h is not part of the uapi headers and linux/if_ether.h conflicts with the musl implementation of netinet/if_ether.h. The maintainers rejected the use of __KERNEL__ preprocessor checks and thus these two headers are only in main.h. All files using packet.h first have to include main.h to work correctly. Reported-by: Markus Pargmann Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 42 +++++++++++++++++++++---- net/batman-adv/bitarray.c | 4 +-- net/batman-adv/bitarray.h | 6 ++++ net/batman-adv/bridge_loop_avoidance.c | 38 ++++++++++++++++++----- net/batman-adv/bridge_loop_avoidance.h | 10 ++++++ net/batman-adv/debugfs.c | 37 +++++++++++++++++----- net/batman-adv/debugfs.h | 7 +++++ net/batman-adv/distributed-arp-table.c | 28 ++++++++++++++--- net/batman-adv/distributed-arp-table.h | 13 ++++++-- net/batman-adv/fragmentation.c | 22 +++++++++++-- net/batman-adv/fragmentation.h | 9 ++++++ net/batman-adv/gateway_client.c | 34 ++++++++++++++++----- net/batman-adv/gateway_client.h | 8 +++++ net/batman-adv/gateway_common.c | 11 ++++++- net/batman-adv/gateway_common.h | 7 +++++ net/batman-adv/hard-interface.c | 38 +++++++++++++++-------- net/batman-adv/hard-interface.h | 11 +++++++ net/batman-adv/hash.c | 6 +++- net/batman-adv/hash.h | 9 ++++++ net/batman-adv/icmp_socket.c | 33 +++++++++++++++++--- net/batman-adv/icmp_socket.h | 7 +++++ net/batman-adv/main.c | 56 +++++++++++++++++++++++----------- net/batman-adv/main.h | 29 ++++++++---------- net/batman-adv/multicast.c | 29 ++++++++++++++++-- net/batman-adv/multicast.h | 6 ++++ net/batman-adv/network-coding.c | 37 +++++++++++++++++++--- net/batman-adv/network-coding.h | 13 ++++++++ net/batman-adv/originator.c | 28 ++++++++++++----- net/batman-adv/originator.h | 13 ++++++++ net/batman-adv/packet.h | 3 ++ net/batman-adv/routing.c | 36 ++++++++++++++++------ net/batman-adv/routing.h | 10 ++++++ net/batman-adv/send.c | 36 ++++++++++++++++------ net/batman-adv/send.h | 13 ++++++++ net/batman-adv/soft-interface.c | 54 +++++++++++++++++++++++--------- net/batman-adv/soft-interface.h | 11 +++++++ net/batman-adv/sysfs.c | 31 +++++++++++++++---- net/batman-adv/sysfs.h | 10 ++++++ net/batman-adv/translation-table.c | 39 ++++++++++++++++++----- net/batman-adv/translation-table.h | 9 ++++++ net/batman-adv/types.h | 18 +++++++++-- 41 files changed, 703 insertions(+), 158 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 4e93d2d3958b..9adbc61c217e 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -15,16 +15,46 @@ * along with this program; if not, see . */ +#include "bat_algo.h" #include "main.h" -#include "translation-table.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitarray.h" +#include "hard-interface.h" +#include "hash.h" +#include "network-coding.h" #include "originator.h" +#include "packet.h" #include "routing.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "hard-interface.h" #include "send.h" -#include "bat_algo.h" -#include "network-coding.h" +#include "translation-table.h" /** * enum batadv_dup_status - duplicate status diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 40e4a2a18e45..cf68c328345e 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -15,10 +15,10 @@ * along with this program; if not, see . */ -#include "main.h" #include "bitarray.h" +#include "main.h" -#include +#include /* shift the packet array by n places. */ static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n) diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index be497be696d1..0c2456225fae 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -18,6 +18,12 @@ #ifndef _NET_BATMAN_ADV_BITARRAY_H_ #define _NET_BATMAN_ADV_BITARRAY_H_ +#include "main.h" + +#include +#include +#include + /* Returns 1 if the corresponding bit in the given seq_bits indicates true * and curr_seqno is within range of last_seqno. Otherwise returns 0. */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index f2ac903e091e..ba0609292ae7 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -15,19 +15,41 @@ * along with this program; if not, see . */ -#include "main.h" -#include "hash.h" -#include "hard-interface.h" -#include "originator.h" #include "bridge_loop_avoidance.h" -#include "translation-table.h" -#include "send.h" +#include "main.h" -#include +#include +#include +#include #include +#include +#include +#include #include -#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" +#include "packet.h" +#include "translation-table.h" static const uint8_t batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 1f506d34039e..0282690389ac 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -18,6 +18,16 @@ #ifndef _NET_BATMAN_ADV_BLA_H_ #define _NET_BATMAN_ADV_BLA_H_ +#include "main.h" + +#include + +struct batadv_hard_iface; +struct batadv_orig_node; +struct batadv_priv; +struct seq_file; +struct sk_buff; + #ifdef CONFIG_BATMAN_ADV_BLA int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, bool is_bcast); diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 46118084221a..c4c1e8030ba0 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -15,21 +15,42 @@ * along with this program; if not, see . */ +#include "debugfs.h" #include "main.h" +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include "debugfs.h" -#include "translation-table.h" -#include "originator.h" -#include "hard-interface.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "soft-interface.h" -#include "icmp_socket.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" +#include "gateway_client.h" +#include "icmp_socket.h" #include "network-coding.h" +#include "originator.h" +#include "translation-table.h" static struct dentry *batadv_debugfs; diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index ed25605ca732..187acdc85dfa 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -18,6 +18,13 @@ #ifndef _NET_BATMAN_ADV_DEBUGFS_H_ #define _NET_BATMAN_ADV_DEBUGFS_H_ +#include "main.h" + +#include + +struct batadv_hard_iface; +struct net_device; + #define BATADV_DEBUGFS_SUBDIR "batman_adv" #if IS_ENABLED(CONFIG_DEBUG_FS) diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 2e6198ecc07d..fb54e6aed096 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -15,18 +15,36 @@ * along with this program; if not, see . */ -#include +#include "distributed-arp-table.h" +#include "main.h" + +#include +#include +#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "main.h" -#include "hash.h" -#include "distributed-arp-table.h" #include "hard-interface.h" +#include "hash.h" #include "originator.h" #include "send.h" -#include "types.h" #include "translation-table.h" static void batadv_dat_purge(struct work_struct *work); diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index ed41b8edba18..3181507ebc14 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -18,12 +18,19 @@ #ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ #define _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ -#ifdef CONFIG_BATMAN_ADV_DAT +#include "main.h" + +#include +#include +#include -#include "types.h" #include "originator.h" +#include "packet.h" -#include +struct seq_file; +struct sk_buff; + +#ifdef CONFIG_BATMAN_ADV_DAT /* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */ #define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 6ce3c84a7e55..c0f0d01ab244 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -15,12 +15,28 @@ * along with this program; if not, see . */ -#include "main.h" #include "fragmentation.h" -#include "send.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hard-interface.h" #include "originator.h" +#include "packet.h" #include "routing.h" -#include "hard-interface.h" +#include "send.h" #include "soft-interface.h" /** diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index ec1e86f899e8..8b9877e70b95 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -18,6 +18,15 @@ #ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_ #define _NET_BATMAN_ADV_FRAGMENTATION_H_ +#include "main.h" + +#include +#include +#include +#include + +struct sk_buff; + void batadv_frag_purge_orig(struct batadv_orig_node *orig, bool (*check_cb)(struct batadv_frag_table_entry *)); bool batadv_frag_skb_fwd(struct sk_buff *skb, diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index a85eaca344e8..14f0a2207379 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -15,18 +15,38 @@ * along with this program; if not, see . */ -#include "main.h" -#include "sysfs.h" #include "gateway_client.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "gateway_common.h" #include "hard-interface.h" #include "originator.h" -#include "translation-table.h" +#include "packet.h" #include "routing.h" -#include -#include -#include -#include +#include "sysfs.h" +#include "translation-table.h" /* These are the offsets of the "hw type" and "hw address length" in the dhcp * packet starting at the beginning of the dhcp header diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 185fb0887654..89565b451c18 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -18,6 +18,14 @@ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ +#include "main.h" + +#include + +struct batadv_tvlv_gateway_data; +struct seq_file; +struct sk_buff; + void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_reselect(struct batadv_priv *bat_priv); void batadv_gw_election(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 0792e2f101e4..39cf44ccebd4 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -15,9 +15,18 @@ * along with this program; if not, see . */ -#include "main.h" #include "gateway_common.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include + #include "gateway_client.h" +#include "packet.h" /** * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index df5434229675..bd5c812cebf4 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -18,6 +18,13 @@ #ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_ #define _NET_BATMAN_ADV_GATEWAY_COMMON_H_ +#include "main.h" + +#include + +struct batadv_priv; +struct net_device; + enum batadv_gw_modes { BATADV_GW_MODE_OFF, BATADV_GW_MODE_CLIENT, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index bdb020e29272..f4a15d2e5eaf 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -15,22 +15,36 @@ * along with this program; if not, see . */ -#include "main.h" -#include "distributed-arp-table.h" #include "hard-interface.h" -#include "soft-interface.h" -#include "send.h" -#include "translation-table.h" -#include "routing.h" -#include "sysfs.h" -#include "debugfs.h" -#include "originator.h" -#include "hash.h" -#include "bridge_loop_avoidance.h" -#include "gateway_client.h" +#include "main.h" +#include +#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bridge_loop_avoidance.h" +#include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "soft-interface.h" +#include "sysfs.h" +#include "translation-table.h" void batadv_hardif_free_rcu(struct rcu_head *rcu) { diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index e8b6ffea703d..5a31420513e1 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -18,6 +18,17 @@ #ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_ #define _NET_BATMAN_ADV_HARD_INTERFACE_H_ +#include "main.h" + +#include +#include +#include +#include +#include +#include + +struct net_device; + enum batadv_hard_if_state { BATADV_IF_NOT_IN_USE, BATADV_IF_TO_BE_REMOVED, diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 3a0e1dcd1f29..e89f3146b092 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -15,8 +15,12 @@ * along with this program; if not, see . */ -#include "main.h" #include "hash.h" +#include "main.h" + +#include +#include +#include /* clears the hash */ static void batadv_hash_init(struct batadv_hashtable *hash) diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index bcf427cafe49..5065f50c9c3c 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -18,7 +18,16 @@ #ifndef _NET_BATMAN_ADV_HASH_H_ #define _NET_BATMAN_ADV_HASH_H_ +#include "main.h" + +#include #include +#include +#include +#include +#include + +struct lock_class_key; /* callback to a compare function. should compare 2 element datas for their * keys, return 0 if same and not 0 if not same diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 6c3cfb57d132..07061bcbaa04 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -15,14 +15,39 @@ * along with this program; if not, see . */ +#include "icmp_socket.h" #include "main.h" + +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include #include -#include "icmp_socket.h" -#include "send.h" -#include "hash.h" -#include "originator.h" +#include +#include +#include +#include +#include +#include + #include "hard-interface.h" +#include "originator.h" +#include "packet.h" +#include "send.h" static struct batadv_socket_client *batadv_socket_client_hash[256]; diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 4815824e2f61..7de7fce4b48c 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -18,6 +18,13 @@ #ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_ #define _NET_BATMAN_ADV_ICMP_SOCKET_H_ +#include "main.h" + +#include + +struct batadv_icmp_header; +struct batadv_priv; + #define BATADV_ICMP_SOCKET "socket" void batadv_socket_init(void); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 548e405d13c1..8457097f1643 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -15,31 +15,53 @@ * along with this program; if not, see . */ +#include "main.h" + +#include +#include +#include #include -#include +#include +#include +#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "main.h" -#include "sysfs.h" +#include + +#include "bat_algo.h" +#include "bridge_loop_avoidance.h" #include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "gateway_common.h" +#include "hard-interface.h" +#include "icmp_socket.h" +#include "multicast.h" +#include "network-coding.h" +#include "originator.h" +#include "packet.h" #include "routing.h" #include "send.h" -#include "originator.h" #include "soft-interface.h" -#include "icmp_socket.h" #include "translation-table.h" -#include "hard-interface.h" -#include "gateway_client.h" -#include "bridge_loop_avoidance.h" -#include "distributed-arp-table.h" -#include "multicast.h" -#include "gateway_common.h" -#include "hash.h" -#include "bat_algo.h" -#include "network-coding.h" -#include "fragmentation.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, * list traversals just rcu-locked diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 985d017920d5..bf0019a41a12 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -163,28 +163,25 @@ enum batadv_uev_type { /* Kernel headers */ -#include /* mutex */ -#include /* needed by all modules */ -#include /* netdevice */ -#include /* ethernet address classification */ -#include /* ethernet header */ -#include /* poll_table */ -#include /* kernel threads */ -#include /* schedule types */ -#include /* workqueue */ +#include +#include /* for packet.h */ +#include +#include +#include +#include /* for packet.h */ +#include +#include +#include #include -#include -#include -#include /* struct sock */ -#include /* ipv6 address stuff */ -#include -#include #include -#include #include #include "types.h" +struct batadv_ogm_packet; +struct seq_file; +struct sk_buff; + #define BATADV_PRINT_VID(vid) (vid & BATADV_VLAN_HAS_TAG ? \ (int)(vid & VLAN_VID_MASK) : -1) diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 09f2838dedf2..7aa480b7edd0 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -15,10 +15,33 @@ * along with this program; if not, see . */ -#include "main.h" #include "multicast.h" -#include "originator.h" -#include "hard-interface.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "packet.h" #include "translation-table.h" /** diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 033d80e84fdf..beb6e56c624a 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -18,6 +18,12 @@ #ifndef _NET_BATMAN_ADV_MULTICAST_H_ #define _NET_BATMAN_ADV_MULTICAST_H_ +#include "main.h" + +struct batadv_orig_node; +struct batadv_priv; +struct sk_buff; + /** * batadv_forw_mode - the way a packet should be forwarded as * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index fb54319abff3..f0a50f31d822 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -15,15 +15,44 @@ * along with this program; if not, see . */ +#include "network-coding.h" +#include "main.h" + +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include "main.h" +#include "hard-interface.h" #include "hash.h" -#include "network-coding.h" -#include "send.h" #include "originator.h" -#include "hard-interface.h" +#include "packet.h" #include "routing.h" +#include "send.h" static struct lock_class_key batadv_nc_coding_hash_lock_class_key; static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index b5ab8ff544ee..5b79aa8c64c1 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -18,6 +18,19 @@ #ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ #define _NET_BATMAN_ADV_NETWORK_CODING_H_ +#include "main.h" + +#include + +struct batadv_nc_node; +struct batadv_neigh_node; +struct batadv_ogm_packet; +struct batadv_orig_node; +struct batadv_priv; +struct net_device; +struct seq_file; +struct sk_buff; + #ifdef CONFIG_BATMAN_ADV_NC void batadv_nc_status_update(struct net_device *net_dev); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index aa092c34b88b..018b7495ad84 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -15,19 +15,31 @@ * along with this program; if not, see . */ +#include "originator.h" #include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "distributed-arp-table.h" -#include "originator.h" -#include "hash.h" -#include "translation-table.h" -#include "routing.h" +#include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" -#include "soft-interface.h" -#include "bridge_loop_avoidance.h" -#include "network-coding.h" -#include "fragmentation.h" +#include "hash.h" #include "multicast.h" +#include "network-coding.h" +#include "routing.h" +#include "translation-table.h" /* hash class keys */ static struct lock_class_key batadv_orig_hash_lock_class_key; diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 0b716fe1db99..79734d302010 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -18,8 +18,21 @@ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ #define _NET_BATMAN_ADV_ORIGINATOR_H_ +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include + #include "hash.h" +struct seq_file; + int batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 9468bc09c7c4..9e747c08d0bc 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -18,6 +18,9 @@ #ifndef _NET_BATMAN_ADV_PACKET_H_ #define _NET_BATMAN_ADV_PACKET_H_ +#include +#include + /** * enum batadv_packettype - types for batman-adv encapsulated packets * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index c5d90095bc3c..c360c0cd19c2 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -15,20 +15,36 @@ * along with this program; if not, see . */ -#include "main.h" #include "routing.h" -#include "send.h" -#include "soft-interface.h" -#include "hard-interface.h" -#include "icmp_socket.h" -#include "translation-table.h" -#include "originator.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitarray.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" -#include "network-coding.h" #include "fragmentation.h" - -#include +#include "hard-interface.h" +#include "icmp_socket.h" +#include "network-coding.h" +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "soft-interface.h" +#include "translation-table.h" static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 6573f12b3ddc..6bc29d33abc1 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -18,6 +18,16 @@ #ifndef _NET_BATMAN_ADV_ROUTING_H_ #define _NET_BATMAN_ADV_ROUTING_H_ +#include "main.h" + +#include + +struct batadv_hard_iface; +struct batadv_neigh_node; +struct batadv_orig_node; +struct batadv_priv; +struct sk_buff; + bool batadv_check_management_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, int header_len); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 23635bd63fec..0a01992e65ab 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -15,19 +15,37 @@ * along with this program; if not, see . */ +#include "send.h" #include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "distributed-arp-table.h" -#include "send.h" -#include "routing.h" -#include "translation-table.h" -#include "soft-interface.h" -#include "hard-interface.h" -#include "gateway_common.h" +#include "fragmentation.h" #include "gateway_client.h" -#include "originator.h" +#include "hard-interface.h" #include "network-coding.h" -#include "fragmentation.h" -#include "multicast.h" +#include "originator.h" +#include "routing.h" +#include "soft-interface.h" +#include "translation-table.h" static void batadv_send_outstanding_bcast_packet(struct work_struct *work); diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 60c233eb35ed..0536835fe503 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -18,6 +18,19 @@ #ifndef _NET_BATMAN_ADV_SEND_H_ #define _NET_BATMAN_ADV_SEND_H_ +#include "main.h" + +#include +#include + +#include "packet.h" + +struct batadv_hard_iface; +struct batadv_orig_node; +struct batadv_priv; +struct sk_buff; +struct work_struct; + int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const uint8_t *dst_addr); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 50cf722f4e1b..a859322d7b41 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -15,26 +15,50 @@ * along with this program; if not, see . */ -#include "main.h" #include "soft-interface.h" -#include "hard-interface.h" -#include "distributed-arp-table.h" -#include "routing.h" -#include "send.h" -#include "debugfs.h" -#include "translation-table.h" -#include "hash.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "sysfs.h" -#include "originator.h" -#include -#include +#include "main.h" + +#include +#include +#include +#include +#include #include +#include +#include +#include #include -#include "multicast.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "bridge_loop_avoidance.h" +#include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "gateway_common.h" +#include "hard-interface.h" +#include "multicast.h" #include "network-coding.h" +#include "packet.h" +#include "send.h" +#include "sysfs.h" +#include "translation-table.h" static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd); static void batadv_get_drvinfo(struct net_device *dev, diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 9ce08049ffd0..578e8a663c30 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -18,6 +18,17 @@ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ +#include "main.h" + +#include + +struct batadv_hard_iface; +struct batadv_orig_node; +struct batadv_priv; +struct batadv_softif_vlan; +struct net_device; +struct sk_buff; + int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, struct batadv_hard_iface *recv_if, diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 7278b2bd5cd0..d6a312a82c03 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -15,16 +15,35 @@ * along with this program; if not, see . */ -#include "main.h" #include "sysfs.h" -#include "translation-table.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "distributed-arp-table.h" -#include "network-coding.h" -#include "originator.h" +#include "gateway_client.h" +#include "gateway_common.h" #include "hard-interface.h" +#include "network-coding.h" +#include "packet.h" #include "soft-interface.h" -#include "gateway_common.h" -#include "gateway_client.h" static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) { diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index b9e79ad806ac..2294583f7cf9 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -18,6 +18,16 @@ #ifndef _NET_BATMAN_ADV_SYSFS_H_ #define _NET_BATMAN_ADV_SYSFS_H_ +#include "main.h" + +#include +#include + +struct batadv_priv; +struct batadv_softif_vlan; +struct kobject; +struct net_device; + #define BATADV_SYSFS_IF_MESH_SUBDIR "mesh" #define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv" /** diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index fe9f8244ac0a..0611c58168ec 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -15,18 +15,41 @@ * along with this program; if not, see . */ -#include "main.h" #include "translation-table.h" -#include "soft-interface.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bridge_loop_avoidance.h" #include "hard-interface.h" -#include "send.h" #include "hash.h" -#include "originator.h" -#include "routing.h" -#include "bridge_loop_avoidance.h" #include "multicast.h" - -#include +#include "originator.h" +#include "packet.h" +#include "soft-interface.h" /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 5769037c7e2d..6acc25d3a925 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -18,6 +18,15 @@ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ +#include "main.h" + +#include + +struct batadv_orig_node; +struct batadv_priv; +struct net_device; +struct seq_file; + int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, unsigned short vid, int ifindex, uint32_t mark); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 336f94bba558..67d63483618e 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -18,9 +18,23 @@ #ifndef _NET_BATMAN_ADV_TYPES_H_ #define _NET_BATMAN_ADV_TYPES_H_ +#ifndef _NET_BATMAN_ADV_MAIN_H_ +#error only "main.h" can be included directly +#endif + +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include +#include +#include +#include + #include "packet.h" -#include "bitarray.h" -#include + +struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT -- cgit From a2f2b6cd41f5b4f8b5b7914e4e52e4954f631dd2 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 23 Apr 2015 18:22:24 +0200 Subject: batman-adv: Clarify calculation precedence for '&' and '?' Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/translation-table.c | 42 +++++++++++++++++++------------------- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 9adbc61c217e..753383c2215c 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -479,7 +479,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, - (batadv_ogm_packet->flags & BATADV_DIRECTLINK ? + ((batadv_ogm_packet->flags & BATADV_DIRECTLINK) ? "on" : "off"), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index bf0019a41a12..41d27c7872b9 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -182,7 +182,7 @@ struct batadv_ogm_packet; struct seq_file; struct sk_buff; -#define BATADV_PRINT_VID(vid) (vid & BATADV_VLAN_HAS_TAG ? \ +#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \ (int)(vid & VLAN_VID_MASK) : -1) extern char batadv_routing_algo[]; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 0611c58168ec..b4824951010b 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -973,17 +973,17 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, BATADV_PRINT_VID(tt_common_entry->vid), - (tt_common_entry->flags & - BATADV_TT_CLIENT_ROAM ? 'R' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), no_purge ? 'P' : '.', - (tt_common_entry->flags & - BATADV_TT_CLIENT_NEW ? 'N' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_PENDING ? 'X' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_NEW) ? 'N' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_PENDING) ? 'X' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 0 : last_seen_msecs, vlan->tt.crc); @@ -1547,10 +1547,10 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, BATADV_PRINT_VID(tt_global_entry->common.vid), best_entry->ttvn, best_entry->orig_node->orig, last_ttvn, vlan->tt.crc, - (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), - (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), - (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); + ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), + ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), + ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); } @@ -1579,10 +1579,10 @@ print_list: BATADV_PRINT_VID(tt_global_entry->common.vid), orig_entry->ttvn, orig_entry->orig_node->orig, last_ttvn, vlan->tt.crc, - (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), - (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), - (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); + ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), + ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), + ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); } @@ -2548,7 +2548,7 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n", req_src, tt_data->ttvn, req_dst, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); /* Let's get the orig node of the REAL destination */ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst); @@ -2679,7 +2679,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n", req_src, tt_data->ttvn, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); spin_lock_bh(&bat_priv->tt.commit_lock); @@ -2918,7 +2918,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", resp_src, tt_data->ttvn, num_entries, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) -- cgit From 7dac6d9391b89fd85bb5eddd18506ae759c9a57b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 23 Apr 2015 18:22:25 +0200 Subject: batman-adv: Remove unused post-VLAN ethhdr in batadv_gw_dhcp_recipient_get Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/gateway_client.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 14f0a2207379..bb0158620628 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -753,11 +753,6 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return BATADV_DHCP_NO; - /* skb->data might have been reallocated by pskb_may_pull() */ - ethhdr = eth_hdr(skb); - if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) - ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); - udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); -- cgit From 94d1dd87316fdb7a403a2750e13ec839fd2fadea Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 31 Mar 2014 13:48:10 +0200 Subject: batman-adv: change the MAC of each VLAN upon ndo_set_mac_address The MAC address of the soft-interface is used to initialise the "non-purge" TT entry of each existing VLAN. Therefore when the user invokes ndo_set_mac_address() all the "non-purge" TT entries have to be updated, not only the one belonging to the non-tagged network. Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/soft-interface.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a859322d7b41..c002961da75d 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -129,6 +129,7 @@ static struct net_device_stats *batadv_interface_stats(struct net_device *dev) static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; uint8_t old_addr[ETH_ALEN]; @@ -139,12 +140,17 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) ether_addr_copy(dev->dev_addr, addr->sa_data); /* only modify transtable if it has been initialized before */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) { - batadv_tt_local_remove(bat_priv, old_addr, BATADV_NO_FLAGS, + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + return 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); - batadv_tt_local_add(dev, addr->sa_data, BATADV_NO_FLAGS, + batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } + rcu_read_unlock(); return 0; } -- cgit From b80c0e78582d4a3a004dc1ade4eb06babc6a4eea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Jun 2015 18:30:43 -0700 Subject: tcp: get_cookie_sock() consolidation IPv4 and IPv6 share same implementation of get_cookie_sock(), and there is no point inlining it. We add tcp_ prefix to the common helper name and export it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 10 +++++----- net/ipv6/syncookies.c | 19 +------------------ 2 files changed, 6 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index df849e5a10f1..d70b1f603692 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -219,9 +219,9 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_check); -static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; @@ -235,7 +235,7 @@ static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, } return child; } - +EXPORT_SYMBOL(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored @@ -391,7 +391,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); - ret = get_cookie_sock(sk, skb, req, &rt->dst); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 21bc2eb53c57..0909f4e0d53c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -41,23 +41,6 @@ static __u16 const msstab[] = { 9000 - 60, }; -static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct sock *child; - - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) { - atomic_set(&req->rsk_refcnt, 1); - inet_csk_reqsk_queue_add(sk, req, child); - } else { - reqsk_free(req); - } - return child; -} - static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); @@ -264,7 +247,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); - ret = get_cookie_sock(sk, skb, req, dst); + ret = tcp_get_cookie_sock(sk, skb, req, dst); out: return ret; out_free: -- cgit From f38b24c90528c888915ef6e3bc320bdb30b14cf2 Mon Sep 17 00:00:00 2001 From: Firo Yang Date: Mon, 8 Jun 2015 11:54:51 +0800 Subject: fib_trie: coding style: Use pointer after check As Alexander Duyck pointed out that: struct tnode { ... struct key_vector kv[1]; } The kv[1] member of struct tnode is an arry that refernced by a null pointer will not crash the system, like this: struct tnode *p = NULL; struct key_vector *kv = p->kv; As such p->kv doesn't actually dereference anything, it is simply a means for getting the offset to the array from the pointer p. This patch make the code more regular to avoid making people feel odd when they look at the code. Signed-off-by: Firo Yang Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 01bce1506cd7..3c699c4e90a4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -325,13 +325,15 @@ static inline void empty_child_dec(struct key_vector *n) static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { - struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); - struct key_vector *l = kv->kv; + struct key_vector *l; + struct tnode *kv; + kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ + l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; @@ -346,24 +348,26 @@ static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) static struct key_vector *tnode_new(t_key key, int pos, int bits) { - struct tnode *tnode = tnode_alloc(bits); unsigned int shift = pos + bits; - struct key_vector *tn = tnode->kv; + struct key_vector *tn; + struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); - pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), - sizeof(struct key_vector *) << bits); - + tnode = tnode_alloc(bits); if (!tnode) return NULL; + pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), + sizeof(struct key_vector *) << bits); + if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; + tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; @@ -2054,11 +2058,12 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct key_vector *n, *pn = t->kv; + struct key_vector *n, *pn; if (!t) return NULL; + pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; -- cgit From 781f899f2f9d8b71e35225a087f90052059486c5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 6 Jun 2015 06:06:49 +0200 Subject: Bluetooth: Fix race condition with user channel and setup stage During the initial setup stage of a controller, the low-level transport is actually active. This means that HCI_UP is true. To avoid toggling the transport off and back on again for normal operation the kernel holds a grace period with HCI_AUTO_OFF that will turn the low-level transport off in case no user is present. The idea of the grace period is important to avoid having to initialize all of the controller twice. So legacy ioctl and the new management interface knows how to clear this grace period and then start normal operation. For the user channel operation this grace period has not been taken into account which results in the problem that HCI_UP and HCI_AUTO_OFF are set and the kernel will return EBUSY. However from a system point of view the controller is ready to be grabbed by either the ioctl, the management interface or the user channel. This patch brings the user channel to the same level as the other two entries for operating a controller. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg Cc: stable@vger.kernel.org --- net/bluetooth/hci_sock.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 9467545e5c97..f2d30d1156c9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -741,10 +741,11 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } - if (test_bit(HCI_UP, &hdev->flags) || - test_bit(HCI_INIT, &hdev->flags) || + if (test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || - hci_dev_test_flag(hdev, HCI_CONFIG)) { + hci_dev_test_flag(hdev, HCI_CONFIG) || + (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && + test_bit(HCI_UP, &hdev->flags))) { err = -EBUSY; hci_dev_put(hdev); goto done; @@ -760,10 +761,21 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = hci_dev_open(hdev->id); if (err) { - hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); - mgmt_index_added(hdev); - hci_dev_put(hdev); - goto done; + if (err == -EALREADY) { + /* In case the transport is already up and + * running, clear the error here. + * + * This can happen when opening an user + * channel and HCI_AUTO_OFF grace period + * is still active. + */ + err = 0; + } else { + hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); + mgmt_index_added(hdev); + hci_dev_put(hdev); + goto done; + } } atomic_inc(&hdev->promisc); -- cgit From 2622e2a03cd7320f57a4f5171c242dccdab035dd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 31 May 2015 17:44:45 -0700 Subject: NFC: nci: hci: Fix releasing uninitialized skbs Several of these goto exit; uses should be direct returns as skb is not yet initialized by nci_hci_get_param(). Miscellanea: o Use !memcmp instead of memcmp() == 0 o Remove unnecessary goto from if () {... goto exit;} else {...} exit: Signed-off-by: Joe Perches Signed-off-by: Samuel Ortiz --- net/nfc/nci/hci.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index ed54ec533836..af002df640c7 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -639,22 +639,19 @@ int nci_hci_dev_session_init(struct nci_dev *ndev) ndev->hci_dev->init_data.gates[0].gate, ndev->hci_dev->init_data.gates[0].pipe); if (r < 0) - goto exit; + return r; r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb); if (r < 0) - goto exit; + return r; if (skb->len && skb->len == strlen(ndev->hci_dev->init_data.session_id) && - memcmp(ndev->hci_dev->init_data.session_id, - skb->data, skb->len) == 0 && + !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) && ndev->ops->hci_load_session) { /* Restore gate<->pipe table from some proprietary location. */ r = ndev->ops->hci_load_session(ndev); - if (r < 0) - goto exit; } else { r = nci_hci_dev_connect_gates(ndev, ndev->hci_dev->init_data.gate_count, @@ -667,8 +664,6 @@ int nci_hci_dev_session_init(struct nci_dev *ndev) ndev->hci_dev->init_data.session_id, strlen(ndev->hci_dev->init_data.session_id)); } - if (r == 0) - goto exit; exit: kfree_skb(skb); -- cgit From b6355e972aaab0173ce11a1650e7dba67f820918 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Sat, 6 Jun 2015 13:16:37 +0200 Subject: NFC: nci: Handle proprietary response and notifications Allow for drivers to explicitly define handlers for each proprietary notifications and responses they expect to support. Reviewed-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/nfc/nci/ntf.c | 10 ++++++++++ net/nfc/nci/rsp.c | 10 ++++++++++ 3 files changed, 72 insertions(+) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 49ff32106080..56d57c93ea1a 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -28,6 +28,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include +#include #include #include #include @@ -961,6 +962,14 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, return NULL; ndev->ops = ops; + + if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) { + pr_err("Too many proprietary commands: %zd\n", + ops->n_prop_ops); + ops->prop_ops = NULL; + ops->n_prop_ops = 0; + } + ndev->tx_headroom = tx_headroom; ndev->tx_tailroom = tx_tailroom; init_completion(&ndev->req_completion); @@ -1165,6 +1174,49 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) return 0; } +/* Proprietary commands API */ +static struct nci_prop_ops *prop_cmd_lookup(struct nci_dev *ndev, + __u16 opcode) +{ + size_t i; + struct nci_prop_ops *prop_op; + + if (!ndev->ops->prop_ops || !ndev->ops->n_prop_ops) + return NULL; + + for (i = 0; i < ndev->ops->n_prop_ops; i++) { + prop_op = &ndev->ops->prop_ops[i]; + if (prop_op->opcode == opcode) + return prop_op; + } + + return NULL; +} + +int nci_prop_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, + struct sk_buff *skb) +{ + struct nci_prop_ops *prop_op; + + prop_op = prop_cmd_lookup(ndev, rsp_opcode); + if (!prop_op || !prop_op->rsp) + return -ENOTSUPP; + + return prop_op->rsp(ndev, skb); +} + +int nci_prop_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode, + struct sk_buff *skb) +{ + struct nci_prop_ops *prop_op; + + prop_op = prop_cmd_lookup(ndev, ntf_opcode); + if (!prop_op || !prop_op->ntf) + return -ENOTSUPP; + + return prop_op->ntf(ndev, skb); +} + /* ---- NCI TX Data worker thread ---- */ static void nci_tx_work(struct work_struct *work) diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 3218071072ac..5d1c2e391c56 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -758,6 +758,15 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci control header */ skb_pull(skb, NCI_CTRL_HDR_SIZE); + if (nci_opcode_gid(ntf_opcode) == NCI_GID_PROPRIETARY) { + if (nci_prop_ntf_packet(ndev, ntf_opcode, skb)) { + pr_err("unsupported ntf opcode 0x%x\n", + ntf_opcode); + } + + goto end; + } + switch (ntf_opcode) { case NCI_OP_CORE_CONN_CREDITS_NTF: nci_core_conn_credits_ntf_packet(ndev, skb); @@ -796,5 +805,6 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) break; } +end: kfree_skb(skb); } diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index 02486bc2ceea..408bd8f857ab 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -296,6 +296,15 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci control header */ skb_pull(skb, NCI_CTRL_HDR_SIZE); + if (nci_opcode_gid(rsp_opcode) == NCI_GID_PROPRIETARY) { + if (nci_prop_rsp_packet(ndev, rsp_opcode, skb) == -ENOTSUPP) { + pr_err("unsupported rsp opcode 0x%x\n", + rsp_opcode); + } + + goto end; + } + switch (rsp_opcode) { case NCI_OP_CORE_RESET_RSP: nci_core_reset_rsp_packet(ndev, skb); @@ -346,6 +355,7 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) break; } +end: kfree_skb(skb); /* trigger the next cmd */ -- cgit From 81859ab8779567af491fbf83ea628cdf09188d90 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sat, 6 Jun 2015 13:16:39 +0200 Subject: NFC: nci: Add NCI_RESET return code check before setup setup was executed in any case, even if NCI_RESET failed. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 56d57c93ea1a..b900e6a2a284 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -347,8 +347,9 @@ static int nci_open_device(struct nci_dev *ndev) rc = __nci_request(ndev, nci_reset_req, 0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); - if (ndev->ops->setup) - ndev->ops->setup(ndev); + if (!rc && ndev->ops->setup) { + rc = ndev->ops->setup(ndev); + } if (!rc) { rc = __nci_request(ndev, nci_init_req, 0, -- cgit From c39daeee50eb0b95d3b91bda21b77955a459ee5f Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sat, 6 Jun 2015 13:16:40 +0200 Subject: NFC: nci: Add nci init ops for early device initialization Some device may need to execute some proprietary commands in order to "wake-up"; Before the nci state initialization. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index b900e6a2a284..458e58bb9cb1 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -344,8 +344,13 @@ static int nci_open_device(struct nci_dev *ndev) set_bit(NCI_INIT, &ndev->flags); - rc = __nci_request(ndev, nci_reset_req, 0, - msecs_to_jiffies(NCI_RESET_TIMEOUT)); + if (ndev->ops->init) + rc = ndev->ops->init(ndev); + + if (!rc) { + rc = __nci_request(ndev, nci_reset_req, 0, + msecs_to_jiffies(NCI_RESET_TIMEOUT)); + } if (!rc && ndev->ops->setup) { rc = ndev->ops->setup(ndev); -- cgit From 759afb8d288ffbe9a1cdb20af037b5c072dc38b2 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sat, 6 Jun 2015 13:16:41 +0200 Subject: NFC: nci: Add nci_prop_cmd allowing to send proprietary nci cmd Handle allowing to send proprietary nci commands anywhere in the nci state machine. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 458e58bb9cb1..b5dc15044466 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -324,6 +324,32 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) sizeof(struct nci_rf_deactivate_cmd), &cmd); } +struct nci_prop_cmd_param { + __u16 opcode; + size_t len; + __u8 *payload; +}; + +static void nci_prop_cmd_req(struct nci_dev *ndev, unsigned long opt) +{ + struct nci_prop_cmd_param *param = (struct nci_prop_cmd_param *)opt; + + nci_send_cmd(ndev, param->opcode, param->len, param->payload); +} + +int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) +{ + struct nci_prop_cmd_param param; + + param.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY, oid); + param.len = len; + param.payload = payload; + + return __nci_request(ndev, nci_prop_cmd_req, (unsigned long)¶m, + msecs_to_jiffies(NCI_CMD_TIMEOUT)); +} +EXPORT_SYMBOL(nci_prop_cmd); + static int nci_open_device(struct nci_dev *ndev) { int rc = 0; -- cgit From 0e70cba71f8b61e0a0c7df526f5cee2d842ee93c Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sat, 6 Jun 2015 13:16:48 +0200 Subject: NFC: nci: Move close ops call in nci_close_device When closing the device some data (proprietary commands) might be sent. The core state machine needs to be set for correct command execution. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index b5dc15044466..edc10cc8e10b 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -440,6 +440,12 @@ static int nci_close_device(struct nci_dev *ndev) set_bit(NCI_INIT, &ndev->flags); __nci_request(ndev, nci_reset_req, 0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); + + /* After this point our queues are empty + * and no works are scheduled. + */ + ndev->ops->close(ndev); + clear_bit(NCI_INIT, &ndev->flags); del_timer_sync(&ndev->cmd_timer); @@ -447,10 +453,6 @@ static int nci_close_device(struct nci_dev *ndev) /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); - /* After this point our queues are empty - * and no works are scheduled. */ - ndev->ops->close(ndev); - /* Clear flags */ ndev->flags = 0; -- cgit From 9e58095f9660f88d6a2febe87d5073a6b2e9c399 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Tue, 14 Oct 2014 02:19:46 +0200 Subject: NFC: netlink: Implement vendor command support Vendor commands are passed from userspace through the NFC_CMD_VENDOR netlink command, allowing driver and hardware specific operations implementations like for example RF tuning or production line calibration. Drivers will associate a set of vendor commands to a vendor id, which could typically be an OUI. The netlink kernel implementation will try to match the received vendor id and sub command attributes with the registered ones. When such match is found, the driver defined sub command routine is called. Signed-off-by: Samuel Ortiz --- net/nfc/netlink.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 3763036710ae..f85f37ed19b2 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -5,6 +5,12 @@ * Lauro Ramos Venancio * Aloisio Almeida Jr * + * Vendor commands implementation based on net/wireless/nl80211.c + * which is: + * + * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -1489,6 +1495,50 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); } +static int nfc_genl_vendor_cmd(struct sk_buff *skb, + struct genl_info *info) +{ + struct nfc_dev *dev; + struct nfc_vendor_cmd *cmd; + u32 dev_idx, vid, subcmd; + u8 *data; + size_t data_len; + int i; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || + !info->attrs[NFC_ATTR_VENDOR_ID] || + !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) + return -EINVAL; + + dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); + subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); + + dev = nfc_get_device(dev_idx); + if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) + return -ENODEV; + + data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); + if (data) { + data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); + if (data_len == 0) + return -EINVAL; + } else { + data_len = 0; + } + + for (i = 0; i < dev->n_vendor_cmds; i++) { + cmd = &dev->vendor_cmds[i]; + + if (cmd->vendor_id != vid || cmd->subcmd != subcmd) + continue; + + return cmd->doit(dev, data, data_len); + } + + return -EOPNOTSUPP; +} + static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, @@ -1579,6 +1629,11 @@ static const struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_activate_target, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_VENDOR, + .doit = nfc_genl_vendor_cmd, + .policy = nfc_genl_policy, + }, }; -- cgit From 8b76ce34c43a569f981623485c1b6c700594678e Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 8 Jun 2015 18:14:39 +0300 Subject: Bluetooth: Fix encryption key size handling for LTKs The encryption key size for LTKs is supposed to be applied only at the moment of encryption. When generating a Link Key (using LE SC) from the LTK the full non-shortened value should be used. This patch modifies the code to always keep the full value around and only apply the key size when passing the value to HCI. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 4 ++-- net/bluetooth/hci_event.c | 3 ++- net/bluetooth/smp.c | 15 +++------------ 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ee5e59839b02..2c48bf0b5afb 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -276,7 +276,7 @@ u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, - __u8 ltk[16]) + __u8 ltk[16], __u8 key_size) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_start_enc cp; @@ -288,7 +288,7 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, cp.handle = cpu_to_le16(conn->handle); cp.rand = rand; cp.ediv = ediv; - memcpy(cp.ltk, ltk, sizeof(cp.ltk)); + memcpy(cp.ltk, ltk, key_size); hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7b61be73650f..fcbfa4138eb1 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4955,7 +4955,8 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) goto not_found; } - memcpy(cp.ltk, ltk->val, sizeof(ltk->val)); + memcpy(cp.ltk, ltk->val, ltk->enc_size); + memset(cp.ltk + ltk->enc_size, 0, sizeof(cp.ltk) - ltk->enc_size); cp.handle = cpu_to_le16(conn->handle); conn->pending_sec_level = smp_ltk_sec_level(ltk); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 659371af39e4..3921cba056d3 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -997,13 +997,10 @@ static u8 smp_random(struct smp_chan *smp) smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk); - memset(stk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return SMP_UNSPECIFIED; - hci_le_start_enc(hcon, ediv, rand, stk); + hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); } else { @@ -1016,9 +1013,6 @@ static u8 smp_random(struct smp_chan *smp) smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk); - memset(stk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - if (hcon->pending_sec_level == BT_SECURITY_HIGH) auth = 1; else @@ -1156,9 +1150,6 @@ static void sc_add_ltk(struct smp_chan *smp) else auth = 0; - memset(smp->tk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, key_type, auth, smp->tk, smp->enc_key_size, 0, 0); @@ -2202,7 +2193,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return true; - hci_le_start_enc(hcon, key->ediv, key->rand, key->val); + hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; /* We never store STKs for master role, so clear this flag */ @@ -2750,7 +2741,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) sc_add_ltk(smp); if (hcon->out) { - hci_le_start_enc(hcon, 0, 0, smp->tk); + hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } -- cgit From dd895d7f21b244e7fd4c7477697e274de7e44ecb Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 9 Jun 2015 08:05:10 +0200 Subject: can: cangw: introduce optional uid to reference created routing jobs Similar to referencing iptables rules by their line number this UID allows to reference created routing jobs, e.g. to alter configured data modifications. The UID is an optional non-zero value which can be provided at routing job creation time. When the UID is set the UID replaces the data modification configuration as job identification attribute e.g. at job removal time. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/gw.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/can/gw.c b/net/can/gw.c index a6f448e18ea8..455168718c2e 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -110,6 +110,7 @@ struct cf_mod { void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; + u32 uid; }; @@ -548,6 +549,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } + if (gwj->mod.uid) { + if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) + goto cancel; + } + if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) @@ -619,6 +625,7 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = { [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, + [CGW_MOD_UID] = { .type = NLA_U32 }, }; /* check for common and gwtype specific attributes */ @@ -761,6 +768,10 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, else mod->csumfunc.xor = cgw_csum_xor_neg; } + + if (tb[CGW_MOD_UID]) { + nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); + } } if (gwtype == CGW_TYPE_CAN_CAN) { @@ -802,6 +813,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct rtcanmsg *r; struct cgw_job *gwj; + struct cf_mod mod; + struct can_can_gw ccgw; u8 limhops = 0; int err = 0; @@ -819,6 +832,36 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; + err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); + if (err < 0) + return err; + + if (mod.uid) { + + ASSERT_RTNL(); + + /* check for updating an existing job with identical uid */ + hlist_for_each_entry(gwj, &cgw_list, list) { + + if (gwj->mod.uid != mod.uid) + continue; + + /* interfaces & filters must be identical */ + if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) + return -EINVAL; + + /* update modifications with disabled softirq & quit */ + local_bh_disable(); + memcpy(&gwj->mod, &mod, sizeof(mod)); + local_bh_enable(); + return 0; + } + } + + /* ifindex == 0 is not allowed for job creation */ + if (!ccgw.src_idx || !ccgw.dst_idx) + return -ENODEV; + gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; @@ -828,18 +871,14 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; + gwj->limit_hops = limhops; - err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw, - &limhops); - if (err < 0) - goto out; + /* insert already parsed information */ + memcpy(&gwj->mod, &mod, sizeof(mod)); + memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; - /* ifindex == 0 is not allowed for job creation */ - if (!gwj->ccgw.src_idx || !gwj->ccgw.dst_idx) - goto out; - gwj->src.dev = __dev_get_by_index(&init_net, gwj->ccgw.src_idx); if (!gwj->src.dev) @@ -856,8 +895,6 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->dst.dev->type != ARPHRD_CAN) goto out; - gwj->limit_hops = limhops; - ASSERT_RTNL(); err = cgw_register_filter(gwj); @@ -931,8 +968,15 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->limit_hops != limhops) continue; - if (memcmp(&gwj->mod, &mod, sizeof(mod))) - continue; + /* we have a match when uid is enabled and identical */ + if (gwj->mod.uid || mod.uid) { + if (gwj->mod.uid != mod.uid) + continue; + } else { + /* no uid => check for identical modifications */ + if (memcmp(&gwj->mod, &mod, sizeof(mod))) + continue; + } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) -- cgit From 9a4d3d4ba17c93def2b4dc3126eba30716d15469 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Mon, 8 Jun 2015 22:06:40 +0200 Subject: mac802154/iface: remove superfluous WARN_ON call in slave_open() This call was used before we aligned our code with the wireless code base. We are wanted to handle this in the err: code path. Which would actually not work because the WARN_ON() macro would reset the res value to 0 and thus we would never hit err:. Removing it makes the code do what we actually intend. Signed-off-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 3a67d35d4672..e3d77b07c0e4 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -147,7 +147,6 @@ static int mac802154_slave_open(struct net_device *dev) if (!local->open_count) { res = drv_start(local); - WARN_ON(res); if (res) goto err; } -- cgit From 867146a0d242bb9aecc267aa33e0804501550368 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Tue, 9 Jun 2015 11:46:30 +0200 Subject: Bluetooth: Don't call shutdown when leaving user channel Don't interfere with the user channel exclusive access. Signed-off-by: Loic Poulain Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f6c99098959f..573711c2d09e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1553,6 +1553,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) BT_DBG("%s %p", hdev->name, hdev); if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) -- cgit From 9b4c33364eb653a824c58e637c73caa6feb9879c Mon Sep 17 00:00:00 2001 From: Arron Wang Date: Tue, 9 Jun 2015 17:47:22 +0800 Subject: Bluetooth: Make l2cap_recv_acldata() and sco_recv_scodata() return void The return value of l2cap_recv_acldata() and sco_recv_scodata() are not used, then change it to return void Signed-off-by: Arron Wang Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 5 ++--- net/bluetooth/sco.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index dad419782a12..07bd316d02ba 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7442,7 +7442,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) mutex_unlock(&conn->chan_lock); } -int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_hdr *hdr; @@ -7485,7 +7485,7 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return 0; + return; } BT_DBG("Start: total len %d, frag len %d", len, skb->len); @@ -7544,7 +7544,6 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); - return 0; } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 6b6e59dc54cf..688a040c5626 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1110,7 +1110,7 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) sco_conn_del(hcon, bt_to_errno(reason)); } -int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) { struct sco_conn *conn = hcon->sco_data; @@ -1121,12 +1121,11 @@ int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) if (skb->len) { sco_recv_frame(conn, skb); - return 0; + return; } drop: kfree_skb(skb); - return 0; } static struct hci_cb sco_cb = { -- cgit From ff50e8afc537e66bb3daf5d1cd6628d6b76e7f06 Mon Sep 17 00:00:00 2001 From: Arron Wang Date: Tue, 9 Jun 2015 17:47:23 +0800 Subject: Bluetooth: Move SCO support under BT_BREDR config option SCO/eSCO link is supported by BR/EDR controller, it is suitable to move them under BT_BREDR config option Signed-off-by: Arron Wang Signed-off-by: Marcel Holtmann --- net/bluetooth/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 9a8ea232d28f..29c12ae72a66 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -12,9 +12,10 @@ obj-$(CONFIG_BT_6LOWPAN) += bluetooth_6lowpan.o bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ - hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o sco.o lib.o \ + hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ a2mp.o amp.o ecc.o hci_request.o mgmt_util.o +bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o -- cgit From 472be00d04edccbc1ad2d79540b1e9ea99cc7c7f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Jun 2015 16:45:08 +0200 Subject: mac80211: handle aggregation session timeout on fast-xmit path The conversion to the fast-xmit path lost proper aggregation session timeout handling - the last_tx wasn't set on that path and the timer would therefore incorrectly tear down the session periodically (with those drivers/rate control algorithms that have a timeout.) In case of iwlwifi, this was every 5 seconds and caused significant throughput degradation. Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index dbef1b881b3d..7fe528adc5ae 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2719,9 +2719,12 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); - if (tid_tx && - !test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) - return false; + if (tid_tx) { + if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) + return false; + if (tid_tx->timeout) + tid_tx->last_tx = jiffies; + } } /* after this point (skb is modified) we cannot return false */ -- cgit From d01f858c78b0a67218d73b17ec1e494e014ab52a Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Wed, 3 Jun 2015 08:36:13 +0200 Subject: mac80211: release channel on auth failure There were a few rare cases when upon authentication failure channel wasn't released. This could cause stale pointers to remain in chanctx assigned_vifs after interface removal and trigger general protection fault later. This could be triggered, e.g. on ath10k with the following steps: 1. start an AP 2. create 2 extra vifs on ath10k host 3. connect vif1 to the AP 4. connect vif2 to the AP (auth fails because ath10k firmware isn't able to maintain 2 peers with colliding AP mac addresses across vifs and consequently refuses sta_info_insert() in ieee80211_prep_connection()) 5. remove the 2 extra vifs 6. goto step 2; at step 3 kernel was crashing: general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: ath10k_pci ath10k_core ath ... Call Trace: [] ieee80211_check_combinations+0x22b/0x290 [] ? ieee80211_check_concurrent_iface+0x125/0x220 [] ? netpoll_poll_disable+0x84/0x100 [] ieee80211_check_concurrent_iface+0x133/0x220 [] ieee80211_open+0x3e/0x80 [] __dev_open+0xb6/0x130 [] __dev_change_flags+0xa1/0x170 ... RIP [] ieee80211_chanctx_radar_detect+0xa0/0x170 (gdb) l * ieee80211_chanctx_radar_detect+0xa0 0xffffffff81a23140 is in ieee80211_chanctx_radar_detect (/devel/src/linux/net/mac80211/util.c:3182). 3177 */ 3178 WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && 3179 !list_empty(&ctx->assigned_vifs)); 3180 3181 list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list) 3182 if (sdata->radar_required) 3183 radar_detect |= BIT(sdata->vif.bss_conf.chandef.width); 3184 3185 return radar_detect; Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 94274ef569fa..235ed68dca36 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4601,6 +4601,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, eth_zero_addr(ifmgd->bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->auth_data = NULL; + mutex_lock(&sdata->local->mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); err_free: kfree(auth_data); return err; -- cgit From 74d803b6021f7a1cb06363cd9f65dbdf4fcf35e7 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Wed, 3 Jun 2015 10:44:17 +0300 Subject: mac80211: ignore invalid scan RSSI values Channels in 2.4GHz band overlap, this means that if we send a probe request on channel 1 and then move to channel 2, we will hear the probe response on channel 2. In this case, the RSSI will be lower than if we had heard it on the channel on which it was sent (1 in this case). The scan result ignores those invalid values and the station last signal should not be updated as well. In case the scan determines the signal to be invalid turn on the flag so the station last signal will not be updated with the value and thus user space probing for NL80211_STA_INFO_SIGNAL and NL80211_STA_INFO_SIGNAL_AVG will not get this invalid RSSI value. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 3eb121d2aa45..1454c1b7d06c 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -6,7 +6,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -69,6 +69,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, int clen, srlen; enum nl80211_bss_scan_width scan_width; s32 signal = 0; + bool signal_valid; if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) signal = rx_status->signal * 100; @@ -86,6 +87,11 @@ ieee80211_bss_info_update(struct ieee80211_local *local, GFP_ATOMIC); if (!cbss) return NULL; + /* In case the signal is invalid update the status */ + signal_valid = abs(channel->center_freq - cbss->channel->center_freq) + <= local->hw.wiphy->max_adj_channel_rssi_comp; + if (!signal_valid) + rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; bss = (void *)cbss->priv; -- cgit From afd2efb91990667cd4d9171a743f8a89e19d5ef1 Mon Sep 17 00:00:00 2001 From: Alexis Green Date: Fri, 5 Jun 2015 12:43:54 -0700 Subject: mac80211: Fix incorrectly named last_hop_metric variable in mesh_rx_path_sel_frame The last hop metric should refer to link cost (this is how hwmp_route_info_get uses it for example). But in mesh_rx_path_sel_frame we are not dealing with link cost but with the total cost to the origin of a PREQ or PREP. Signed-off-by: Alexis Green CC: Jesse Jones Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 214e63b84e5c..be33e4188209 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -854,7 +854,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, { struct ieee802_11_elems elems; size_t baselen; - u32 last_hop_metric; + u32 path_metric; struct sta_info *sta; /* need action_code */ @@ -877,21 +877,21 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, if (elems.preq_len != 37) /* Right now we support just 1 destination and no AE */ return; - last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, - MPATH_PREQ); - if (last_hop_metric) + path_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, + MPATH_PREQ); + if (path_metric) hwmp_preq_frame_process(sdata, mgmt, elems.preq, - last_hop_metric); + path_metric); } if (elems.prep) { if (elems.prep_len != 31) /* Right now we support no AE */ return; - last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, - MPATH_PREP); - if (last_hop_metric) + path_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, + MPATH_PREP); + if (path_metric) hwmp_prep_frame_process(sdata, mgmt, elems.prep, - last_hop_metric); + path_metric); } if (elems.perr) { if (elems.perr_len != 15) -- cgit From 8df734e865b74d9f273216482a45a38269dc767a Mon Sep 17 00:00:00 2001 From: Chun-Yeow Yeoh Date: Tue, 9 Jun 2015 13:35:33 +0800 Subject: mac80211: fix the beacon csa counter for mesh and ibss The csa counter has moved from sdata to beacon/presp but it is not updated accordingly for mesh and ibss. Fix this. Fixes: af296bdb8da4 ("mac80211: move csa counters from sdata to beacon/presp") Signed-off-by: Chun-Yeow Yeoh Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + net/mac80211/ibss.c | 1 + net/mac80211/mesh.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1a17d3208d8f..d1c94c6a12e1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3437,6 +3437,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, /* Update CSA counters */ if (sdata->vif.csa_active && (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_ADHOC) && params->n_csa_offsets) { int i; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 21716af8bec3..7f72bc9bae2e 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -146,6 +146,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, csa_settings->chandef.chan->center_freq); presp->csa_counter_offsets[0] = (pos - presp->head); *pos++ = csa_settings->count; + presp->csa_current_counter = csa_settings->count; } /* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */ diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index d4684242e78b..817098add1d6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -680,6 +680,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) *pos++ = 0x0; *pos++ = ieee80211_frequency_to_channel( csa->settings.chandef.chan->center_freq); + bcn->csa_current_counter = csa->settings.count; bcn->csa_counter_offsets[0] = hdr_len + 6; *pos++ = csa->settings.count; *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; -- cgit From e060e7adc296c0b3eab1d7b96f36b496733109e4 Mon Sep 17 00:00:00 2001 From: Alexis Green Date: Tue, 9 Jun 2015 13:00:43 -0700 Subject: mac80211: Always check rates and capabilities in mesh mode In mesh mode there is a race between establishing links and processing rates and capabilities in beacons. This is very noticeable with slow beacons (e.g. beacon intervals of 1s) and manifested for us as stations using minstrel when minstrel_ht should be used. Fixed by changing mesh_sta_info_init so that it always checks rates and such if it has not already done so. Signed-off-by: Alexis Green CC: Jesse Jones Signed-off-by: Johannes Berg --- net/mac80211/mesh_plink.c | 3 ++- net/mac80211/sta_info.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 72a127e8a1b6..5438d13e2f00 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -392,8 +392,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, sta->last_rx = jiffies; /* rates and capabilities don't change during peering */ - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->plink_state == NL80211_PLINK_ESTAB && sta->processed_beacon) goto out; + sta->processed_beacon = true; if (sta->sta.supp_rates[band] != rates) changed |= IEEE80211_RC_SUPP_RATES_CHANGED; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 9bd1e97876bd..226f8ca47ad6 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -369,6 +369,8 @@ struct ieee80211_fast_tx { * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID * entry for non-QoS frames * @fast_tx: TX fastpath information + * @processed_beacon: set to true after peer rates and capabilities are + * processed */ struct sta_info { /* General information, mostly static */ @@ -473,6 +475,7 @@ struct sta_info { enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; + bool processed_beacon; #endif #ifdef CONFIG_MAC80211_DEBUGFS -- cgit From 1fc62c526a5717c63d9dbedd2e6a530467349713 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Jun 2015 11:11:20 +0300 Subject: Bluetooth: Fix exposing full value of shortened LTKs When we notify user space of a new LTK or distribute an LTK to the remote peer the value passed should be the shortened version so that it's easy to compare values in various traces. The core spec also sets the requirements for the shortening/masking as: "The masking shall be done after generation and before being distributed, used or stored." Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 7 ++++++- net/bluetooth/smp.c | 9 ++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a6f21f8c2f98..e41bbe28a36e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7603,7 +7603,12 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) if (key->type == SMP_LTK) ev.key.master = 1; - memcpy(ev.key.val, key->val, sizeof(key->val)); + /* Make sure we copy only the significant bytes based on the + * encryption key size, and set the rest of the value to zeroes. + */ + memcpy(ev.key.val, key->val, sizeof(key->enc_size)); + memset(ev.key.val + key->enc_size, 0, + sizeof(ev.key.val) - key->enc_size); mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL); } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3921cba056d3..4bfaa3d3ed28 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1271,7 +1271,14 @@ static void smp_distribute_keys(struct smp_chan *smp) __le16 ediv; __le64 rand; - get_random_bytes(enc.ltk, sizeof(enc.ltk)); + /* Make sure we generate only the significant amount of + * bytes based on the encryption key size, and set the rest + * of the value to zeroes. + */ + get_random_bytes(enc.ltk, smp->enc_key_size); + memset(enc.ltk + smp->enc_key_size, 0, + sizeof(enc.ltk) - smp->enc_key_size); + get_random_bytes(&ediv, sizeof(ediv)); get_random_bytes(&rand, sizeof(rand)); -- cgit From 5ec596c41bba6f4e3eeef5dc089afc8eaa702a7e Mon Sep 17 00:00:00 2001 From: Alexis Green Date: Tue, 9 Jun 2015 16:20:24 -0700 Subject: mac80211: Fix a case of incorrect metric used when forwarding a PREQ This patch fixes a bug in hwmp_preq_frame_process where the wrong metric can be used when forwarding a PREQ. This happens because the code uses the same metric variable to record the value of the metric to the source of the PREQ and the value of the metric to the target of the PREQ. This comes into play when both reply and forward are set which happens when IEEE80211_PREQ_PROACTIVE_PREP_FLAG is set and when MP_F_DO | MP_F_RF is set. The original code had a special case to handle the first case but not the second. The patch uses distinct variables for the two metrics which makes the code flow much clearer and removes the need to restore the original value of metric when forwarding. Signed-off-by: Alexis Green CC: Jesse Jones Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index be33e4188209..085edc1d056b 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -510,14 +510,14 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - const u8 *preq_elem, u32 metric) + const u8 *preq_elem, u32 orig_metric) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath = NULL; const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; - u32 orig_sn, target_sn, lifetime, orig_metric; + u32 orig_sn, target_sn, lifetime, target_metric; bool reply = false; bool forward = true; bool root_is_gate; @@ -528,7 +528,6 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, target_sn = PREQ_IE_TARGET_SN(preq_elem); orig_sn = PREQ_IE_ORIG_SN(preq_elem); target_flags = PREQ_IE_TARGET_F(preq_elem); - orig_metric = metric; /* Proactive PREQ gate announcements */ flags = PREQ_IE_FLAGS(preq_elem); root_is_gate = !!(flags & RANN_FLAG_IS_GATE); @@ -539,7 +538,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mhwmp_dbg(sdata, "PREQ is for us\n"); forward = false; reply = true; - metric = 0; + target_metric = 0; if (time_after(jiffies, ifmsh->last_sn_update + net_traversal_jiffies(sdata)) || time_before(jiffies, ifmsh->last_sn_update)) { @@ -556,7 +555,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, reply = true; target_addr = sdata->vif.addr; target_sn = ++ifmsh->sn; - metric = 0; + target_metric = 0; ifmsh->last_sn_update = jiffies; } if (root_is_gate) @@ -574,7 +573,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, } else if ((!(target_flags & MP_F_DO)) && (mpath->flags & MESH_PATH_ACTIVE)) { reply = true; - metric = mpath->metric; + target_metric = mpath->metric; target_sn = mpath->sn; if (target_flags & MP_F_RF) target_flags |= MP_F_DO; @@ -593,7 +592,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_sel_frame_tx(MPATH_PREP, 0, orig_addr, orig_sn, 0, target_addr, target_sn, mgmt->sa, 0, ttl, - lifetime, metric, 0, sdata); + lifetime, target_metric, 0, + sdata); } else { ifmsh->mshstats.dropped_frames_ttl++; } @@ -619,13 +619,12 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, if (flags & IEEE80211_PREQ_PROACTIVE_PREP_FLAG) { target_addr = PREQ_IE_TARGET_ADDR(preq_elem); target_sn = PREQ_IE_TARGET_SN(preq_elem); - metric = orig_metric; } mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr, orig_sn, target_flags, target_addr, target_sn, da, hopcount, ttl, lifetime, - metric, preq_id, sdata); + orig_metric, preq_id, sdata); if (!is_multicast_ether_addr(da)) ifmsh->mshstats.fwded_unicast++; else -- cgit From 2df7f8c69521a4d85dfbc788da260b3c4030980c Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 10 Jun 2015 12:50:22 +0200 Subject: NFC: nci: Export nci_req_complete Drivers implementing proprietary ops may need it now. Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index edc10cc8e10b..f9aa08780b06 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -74,6 +74,7 @@ void nci_req_complete(struct nci_dev *ndev, int result) complete(&ndev->req_completion); } } +EXPORT_SYMBOL(nci_req_complete); static void nci_req_cancel(struct nci_dev *ndev, int err) { -- cgit From 30686bf7f5b3c30831761e188a6e3cb33580fa48 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jun 2015 21:39:54 +0200 Subject: mac80211: convert HW flags to unsigned long bitmap As we're running out of hardware capability flags pretty quickly, convert them to use the regular test_bit() style unsigned long bitmaps. This introduces a number of helper functions/macros to set and to test the bits, along with new debugfs code. The occurrences of an explicit __clear_bit() are intentional, the drivers were never supposed to change their supported bits on the fly. We should investigate changing this to be a per-frame flag. Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 +- net/mac80211/cfg.c | 10 ++-- net/mac80211/debugfs.c | 96 +++++++++++++++++++++----------------- net/mac80211/driver-ops.h | 2 +- net/mac80211/iface.c | 10 ++-- net/mac80211/key.c | 4 +- net/mac80211/main.c | 14 +++--- net/mac80211/mlme.c | 63 ++++++++++++------------- net/mac80211/offchannel.c | 2 +- net/mac80211/pm.c | 4 +- net/mac80211/rate.c | 6 +-- net/mac80211/rc80211_minstrel_ht.c | 2 +- net/mac80211/rx.c | 26 +++++------ net/mac80211/scan.c | 10 ++-- net/mac80211/sta_info.c | 14 +++--- net/mac80211/status.c | 13 +++--- net/mac80211/tdls.c | 2 +- net/mac80211/tx.c | 34 +++++++------- net/mac80211/util.c | 6 +-- 19 files changed, 164 insertions(+), 158 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index cce9d425c718..c8ba2e77737c 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -564,8 +564,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, return -EINVAL; if ((tid >= IEEE80211_NUM_TIDS) || - !(local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) || - (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) + !ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) || + ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) return -EINVAL; ht_dbg(sdata, "Open BA session requested for %pM tid %u\n", diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1d01190535b0..bf7023f6c327 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1763,7 +1763,7 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, /* our RSSI threshold implementation is supported only for * devices that report signal in dBm. */ - if (!(sdata->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)) + if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) return -ENOTSUPP; conf->rssi_threshold = nconf->rssi_threshold; } @@ -2407,7 +2407,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return -EOPNOTSUPP; if (enabled == sdata->u.mgd.powersave && @@ -2422,7 +2422,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, __ieee80211_request_smps_mgd(sdata, sdata->u.mgd.req_smps); sdata_unlock(sdata); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local, -1); @@ -2466,7 +2466,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { ret = drv_set_bitrate_mask(local, sdata, mask); if (ret) return ret; @@ -3451,7 +3451,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 1b94d2704c27..3ea8b7de9633 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -91,56 +91,66 @@ static const struct file_operations reset_ops = { }; #endif +static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { +#define FLAG(F) [IEEE80211_HW_##F] = #F + FLAG(HAS_RATE_CONTROL), + FLAG(RX_INCLUDES_FCS), + FLAG(HOST_BROADCAST_PS_BUFFERING), + FLAG(SIGNAL_UNSPEC), + FLAG(SIGNAL_DBM), + FLAG(NEED_DTIM_BEFORE_ASSOC), + FLAG(SPECTRUM_MGMT), + FLAG(AMPDU_AGGREGATION), + FLAG(SUPPORTS_PS), + FLAG(PS_NULLFUNC_STACK), + FLAG(SUPPORTS_DYNAMIC_PS), + FLAG(MFP_CAPABLE), + FLAG(WANT_MONITOR_VIF), + FLAG(NO_AUTO_VIF), + FLAG(SW_CRYPTO_CONTROL), + FLAG(SUPPORT_FAST_XMIT), + FLAG(REPORTS_TX_ACK_STATUS), + FLAG(CONNECTION_MONITOR), + FLAG(QUEUE_CONTROL), + FLAG(SUPPORTS_PER_STA_GTK), + FLAG(AP_LINK_PS), + FLAG(TX_AMPDU_SETUP_IN_HW), + FLAG(SUPPORTS_RC_TABLE), + FLAG(P2P_DEV_ADDR_FOR_INTF), + FLAG(TIMING_BEACON_ONLY), + FLAG(SUPPORTS_HT_CCK_RATES), + FLAG(CHANCTX_STA_CSA), + FLAG(SUPPORTS_CLONED_SKBS), + FLAG(SINGLE_SCAN_ON_ALL_BANDS), + + /* keep last for the build bug below */ + (void *)0x1 +#undef FLAG +}; + static ssize_t hwflags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; - int mxln = 500; + size_t bufsz = 30 * NUM_IEEE80211_HW_FLAGS; + char *buf = kzalloc(bufsz, GFP_KERNEL); + char *pos = buf, *end = buf + bufsz - 1; ssize_t rv; - char *buf = kzalloc(mxln, GFP_KERNEL); - int sf = 0; /* how many written so far */ + int i; if (!buf) - return 0; - - sf += scnprintf(buf, mxln - sf, "0x%x\n", local->hw.flags); - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) - sf += scnprintf(buf + sf, mxln - sf, "HAS_RATE_CONTROL\n"); - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) - sf += scnprintf(buf + sf, mxln - sf, "RX_INCLUDES_FCS\n"); - if (local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING) - sf += scnprintf(buf + sf, mxln - sf, - "HOST_BCAST_PS_BUFFERING\n"); - if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) - sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n"); - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_DBM\n"); - if (local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC) - sf += scnprintf(buf + sf, mxln - sf, - "NEED_DTIM_BEFORE_ASSOC\n"); - if (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT) - sf += scnprintf(buf + sf, mxln - sf, "SPECTRUM_MGMT\n"); - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) - sf += scnprintf(buf + sf, mxln - sf, "AMPDU_AGGREGATION\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PS\n"); - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) - sf += scnprintf(buf + sf, mxln - sf, "PS_NULLFUNC_STACK\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n"); - if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE) - sf += scnprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) - sf += scnprintf(buf + sf, mxln - sf, - "REPORTS_TX_ACK_STATUS\n"); - if (local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) - sf += scnprintf(buf + sf, mxln - sf, "CONNECTION_MONITOR\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PER_STA_GTK\n"); - if (local->hw.flags & IEEE80211_HW_AP_LINK_PS) - sf += scnprintf(buf + sf, mxln - sf, "AP_LINK_PS\n"); - if (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW) - sf += scnprintf(buf + sf, mxln - sf, "TX_AMPDU_SETUP_IN_HW\n"); + return -ENOMEM; + + /* fail compilation if somebody adds or removes + * a flag without updating the name array above + */ + BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); + + for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { + if (test_bit(i, local->hw.flags)) + pos += scnprintf(pos, end - pos, "%s", + hw_flag_names[i]); + } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index c01e681b90fb..32a2e707e222 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -146,7 +146,7 @@ static inline int drv_add_interface(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF) && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b2e85ffca7ed..ed1edac14372 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -338,7 +338,7 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, if ((iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_P2P_GO && iftype != NL80211_IFTYPE_MESH_POINT) || - !(sdata->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) { + !ieee80211_hw_check(&sdata->local->hw, QUEUE_CONTROL)) { sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; return 0; } @@ -378,7 +378,7 @@ static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata) int i; for (i = 0; i < IEEE80211_NUM_ACS; i++) { - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE; else if (local->hw.queues >= IEEE80211_NUM_ACS) sdata->vif.hw_queue[i] = i; @@ -393,7 +393,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; int ret; - if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return 0; ASSERT_RTNL(); @@ -454,7 +454,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return; ASSERT_RTNL(); @@ -1586,7 +1586,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: - if (local->hw.flags & IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF) { + if (ieee80211_hw_check(&local->hw, P2P_DEV_ADDR_FOR_INTF)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) continue; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index b9aac809628f..8abc31ebcf61 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -147,7 +147,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) * is supported; if not, return. */ if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) && - !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK)) + !ieee80211_hw_check(&key->local->hw, SUPPORTS_PER_STA_GTK)) goto out_unsupported; if (sta && !sta->uploaded) @@ -201,7 +201,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) /* all of these we can do in software - if driver can */ if (ret == 1) return 0; - if (key->local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL) + if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) return -EINVAL; return 0; default: diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 674164fe5cdb..3c63468b4dfb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -661,7 +661,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) { bool have_wep = !(IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)); - bool have_mfp = local->hw.flags & IEEE80211_HW_MFP_CAPABLE; + bool have_mfp = ieee80211_hw_check(&local->hw, MFP_CAPABLE); int n_suites = 0, r = 0, w = 0; u32 *suites; static const u32 cipher_suites[] = { @@ -681,7 +681,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) WLAN_CIPHER_SUITE_BIP_GMAC_256, }; - if (local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL || + if (ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL) || local->hw.wiphy->cipher_suites) { /* If the driver advertises, or doesn't support SW crypto, * we only need to remove WEP if necessary. @@ -797,7 +797,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) netdev_features_t feature_whitelist; struct cfg80211_chan_def dflt_chandef = {}; - if (hw->flags & IEEE80211_HW_QUEUE_CONTROL && + if (ieee80211_hw_check(hw, QUEUE_CONTROL) && (local->hw.offchannel_tx_hw_queue == IEEE80211_INVAL_HW_QUEUE || local->hw.offchannel_tx_hw_queue >= local->hw.queues)) return -EINVAL; @@ -945,9 +945,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 supports control port protocol changing */ local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; - } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { + } else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; if (hw->max_signal <= 0) { result = -EINVAL; @@ -1001,7 +1001,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->hw.wiphy->flags |= WIPHY_FLAG_TDLS_EXTERNAL_SETUP; /* mac80211 supports eCSA, if the driver supports STA CSA at all */ - if (local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA) + if (ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) local->ext_capa[0] |= WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING; local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM; @@ -1069,7 +1069,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && - !(hw->flags & IEEE80211_HW_NO_AUTO_VIF)) { + !ieee80211_hw_check(hw, NO_AUTO_VIF)) { result = ieee80211_if_add(local, "wlan%d", NET_NAME_ENUM, NULL, NL80211_IFTYPE_STATION, NULL); if (result) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 235ed68dca36..9b2cc278ac2a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -118,7 +118,7 @@ void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.bcn_mon_timer, @@ -134,7 +134,7 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) ifmgd->probe_send_count = 0; - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.conn_mon_timer, @@ -677,7 +677,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) capab |= WLAN_CAPABILITY_PRIVACY; if ((assoc_data->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && - (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)) + ieee80211_hw_check(&local->hw, SPECTRUM_MGMT)) capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) @@ -885,7 +885,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) drv_mgd_prepare_tx(local, sdata); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); @@ -927,7 +927,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) @@ -1198,7 +1198,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, chanctx = container_of(conf, struct ieee80211_chanctx, conf); if (local->use_chanctx && - !(local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA)) { + !ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { sdata_info(sdata, "driver doesn't support chan-switch with channel contexts\n"); goto drop_connection; @@ -1407,15 +1407,15 @@ static void ieee80211_enable_ps(struct ieee80211_local *local, return; if (conf->dynamic_ps_timeout > 0 && - !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) { + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(conf->dynamic_ps_timeout)); } else { - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) ieee80211_send_nullfunc(local, sdata, 1); - if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) return; conf->flags |= IEEE80211_CONF_PS; @@ -1474,7 +1474,7 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) int count = 0; int timeout; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) { + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) { local->ps_sdata = NULL; return; } @@ -1620,7 +1620,7 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } - if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + @@ -1633,8 +1633,8 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) } } - if (!((local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && - (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) || + if (!(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && + ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) || (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; @@ -2159,7 +2159,7 @@ static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) ieee80211_recalc_ps(local, -1); mutex_unlock(&local->iflist_mtx); - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) goto out; /* @@ -2257,7 +2257,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) */ ifmgd->probe_send_count++; - if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; ieee80211_send_nullfunc(sdata->local, sdata, 0); } else { @@ -2562,7 +2562,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, @@ -3337,7 +3337,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, } ifmgd->have_beacon = true; ifmgd->assoc_data->need_beacon = false; - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); sdata->vif.bss_conf.sync_device_ts = @@ -3443,7 +3443,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, len - baselen, false, &elems, care_about_ies, ncrc); - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) { + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) { bool directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid); @@ -3511,7 +3511,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, * the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. */ - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); sdata->vif.bss_conf.sync_device_ts = @@ -3749,7 +3749,7 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) auth_data->expected_transaction = trans; } - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; @@ -3822,7 +3822,7 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) IEEE80211_ASSOC_MAX_TRIES); ieee80211_send_assoc(sdata); - if (!(local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); @@ -3936,7 +3936,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) max_tries = max_nullfunc_tries; else max_tries = max_probe_tries; @@ -3961,7 +3961,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) } } else if (time_is_after_jiffies(ifmgd->probe_timeout)) run_again(sdata, ifmgd->probe_timeout); - else if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + else if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { mlme_dbg(sdata, "Failed to send nullfunc to AP %pM after %dms, disconnecting\n", bssid, probe_wait_ms); @@ -4030,14 +4030,11 @@ static void ieee80211_sta_monitor_work(struct work_struct *work) static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) { - u32 flags; - if (sdata->vif.type == NL80211_IFTYPE_STATION) { __ieee80211_stop_poll(sdata); /* let's probe the connection once */ - flags = sdata->local->hw.flags; - if (!(flags & IEEE80211_HW_CONNECTION_MONITOR)) + if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.monitor_work); /* and do all the other regular work too */ @@ -4450,8 +4447,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = tim_ie[2]; else sdata->vif.bss_conf.sync_dtim_count = 0; - } else if (!(local->hw.flags & - IEEE80211_HW_TIMING_BEACON_ONLY)) { + } else if (!ieee80211_hw_check(&sdata->local->hw, + TIMING_BEACON_ONLY)) { ies = rcu_dereference(cbss->proberesp_ies); /* must be non-NULL since beacon IEs were NULL */ sdata->vif.bss_conf.sync_tsf = ies->tsf; @@ -4829,7 +4826,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && - (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK), + ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK), "U-APSD not supported with HW_PS_NULLFUNC_STACK\n")) sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD; @@ -4910,7 +4907,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); - if (sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC && + if (ieee80211_hw_check(&sdata->local->hw, NEED_DTIM_BEFORE_ASSOC) && !beacon_ies) { /* * Wait up to one beacon interval ... @@ -4937,7 +4934,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout = jiffies; assoc_data->timeout_started = true; - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = beacon_ies->tsf; sdata->vif.bss_conf.sync_device_ts = bss->device_ts_beacon; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 683f0e3cb124..f2c75cf491fc 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -46,7 +46,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) } if (!local->offchannel_ps_enabled || - !(local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) + !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) /* * If power save was enabled, no need to send a nullfunc * frame because AP knows that we are sleeping. But if the diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index ac6ad6238e3a..06b60980c62c 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -23,7 +23,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) ieee80211_del_virtual_monitor(local); - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); @@ -82,7 +82,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) if (err < 0) { local->quiescing = false; local->wowlan = false; - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index de69adf24f53..36ba7c4f0283 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -680,7 +680,7 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, info->control.rates[i].count = 0; } - if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; if (ista) { @@ -691,7 +691,7 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, ref->ops->get_rate(ref->priv, NULL, NULL, txrc); } - if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_RC_TABLE) + if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) return; ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb, @@ -733,7 +733,7 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, if (local->open_count) return -EBUSY; - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { if (WARN_ON(!local->ops->set_rts_threshold)) return -EINVAL; return 0; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 7430a1df2ab1..543b67233535 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1070,7 +1070,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (sband->band != IEEE80211_BAND_2GHZ) return; - if (!(mp->hw->flags & IEEE80211_HW_SUPPORTS_HT_CCK_RATES)) + if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; mi->cck_supported = 0; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7d85f7516324..5dae166cb7f5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -52,7 +52,7 @@ static struct sk_buff *remove_monitor_info(struct ieee80211_local *local, struct sk_buff *skb, unsigned int rtap_vendor_space) { - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) { + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) { if (likely(skb->len > FCS_LEN)) __pskb_trim(skb, skb->len - FCS_LEN); else { @@ -110,7 +110,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len = ALIGN(len, 8); len += 8; } - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) len += 1; /* antenna field, if we don't have per-chain info */ @@ -185,7 +185,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } mpdulen = skb->len; - if (!(has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS))) + if (!(has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS))) mpdulen += FCS_LEN; rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len); @@ -239,7 +239,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } /* IEEE80211_RADIOTAP_FLAGS */ - if (has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)) + if (has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) *pos |= IEEE80211_RADIOTAP_F_FCS; if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) *pos |= IEEE80211_RADIOTAP_F_BADFCS; @@ -289,7 +289,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += 2; /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */ - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM && + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM) && !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { *pos = status->signal; rthdr->it_present |= @@ -458,7 +458,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, * the SKB because it has a bad FCS/PLCP checksum. */ - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) present_fcs_len = FCS_LEN; /* ensure hdr->frame_control and vendor radiotap data are in skb head */ @@ -1197,7 +1197,7 @@ static void sta_ps_start(struct sta_info *sta) atomic_inc(&ps->num_sta_ps); set_sta_flag(sta, WLAN_STA_PS_STA); - if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta); ps_dbg(sdata, "STA %pM aid %d enters power save mode\n", sta->sta.addr, sta->sta.aid); @@ -1245,7 +1245,7 @@ int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start) struct sta_info *sta_inf = container_of(sta, struct sta_info, sta); bool in_ps; - WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS)); + WARN_ON(!ieee80211_hw_check(&sta_inf->local->hw, AP_LINK_PS)); /* Don't let the same PS state be set twice */ in_ps = test_sta_flag(sta_inf, WLAN_STA_PS_STA); @@ -1281,7 +1281,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) * uAPSD and PS-Poll frames (the latter shouldn't even come up from * it to mac80211 since they're handled.) */ - if (sdata->local->hw.flags & IEEE80211_HW_AP_LINK_PS) + if (ieee80211_hw_check(&sdata->local->hw, AP_LINK_PS)) return RX_CONTINUE; /* @@ -1413,7 +1413,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * Change STA power saving mode only at the end of a frame * exchange sequence. */ - if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) && + if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) && !ieee80211_has_morefrags(hdr->frame_control) && !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) && (rx->sdata->vif.type == NL80211_IFTYPE_AP || @@ -2543,7 +2543,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) { int sig = 0; - if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) sig = status->signal; cfg80211_report_obss_beacon(rx->local->hw.wiphy, @@ -2874,7 +2874,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) * it transmitted were processed or returned. */ - if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, @@ -2939,7 +2939,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) info->flags = IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK | IEEE80211_TX_CTL_NO_CCK_RATE; - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = local->hw.offchannel_tx_hw_queue; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 1454c1b7d06c..11d0901ebb7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -71,9 +71,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local, s32 signal = 0; bool signal_valid; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) signal = rx_status->signal * 100; - else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) signal = (rx_status->signal * 100) / local->hw.max_signal; scan_width = NL80211_BSS_CHAN_WIDTH_20; @@ -263,7 +263,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; - if (local->hw.flags & IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS) { + if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); @@ -332,7 +332,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) return; if (hw_scan && !aborted && - !(local->hw.flags & IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS) && + !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(local)) { int rc; @@ -526,7 +526,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; - if (local->hw.flags & IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS) { + if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index ce0c1662de42..666ddac3c87c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -282,7 +282,7 @@ static void sta_deliver_ps_frames(struct work_struct *wk) static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; @@ -643,7 +643,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (local->hw.flags & IEEE80211_HW_AP_LINK_PS) + if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) return; if (sta->dead) @@ -1148,7 +1148,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; - if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); if (sta->sta.txq[0]) { @@ -1879,8 +1879,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } - if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || - (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { + if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || + ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)sta->last_signal; sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); @@ -1932,7 +1932,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && - local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = sta->tx_msdu_retries[i]; @@ -1940,7 +1940,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && - local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->tx_msdu_failed[i]; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 67c428735a51..45628f37c083 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -181,7 +181,7 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) sta->last_rx = jiffies; if (ieee80211_is_data_qos(mgmt->frame_control)) { @@ -414,8 +414,7 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, if (is_teardown) { /* This mechanism relies on being able to get ACKs */ - WARN_ON(!(local->hw.flags & - IEEE80211_HW_REPORTS_TX_ACK_STATUS)); + WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { @@ -731,7 +730,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_get_qos_ctl(hdr), sta, true, acked); - if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) && + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->last_tx_rate = info->status.rates[rates_idx]; @@ -798,11 +797,11 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_frame_acked(sta, skb); if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data, acked, info->status.tx_time); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->lost_packets) sta->lost_packets = 0; @@ -853,7 +852,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) } if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) { diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 28298cc50326..ad31b2dab4f5 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -935,7 +935,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, * packet through the AP. */ if ((action_code == WLAN_TDLS_TEARDOWN) && - (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { bool try_resend; /* Should we keep skb for possible resend */ /* If not sending directly to peer - no point in keeping skb */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7fe528adc5ae..8410bb3bf5e8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -211,11 +211,11 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) struct ieee80211_if_managed *ifmgd; /* driver doesn't support power save */ - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return TX_CONTINUE; /* hardware does dynamic power save */ - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) return TX_CONTINUE; /* dynamic power save disabled */ @@ -431,7 +431,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_is_probe_req(hdr->frame_control)) return TX_CONTINUE; - if (tx->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; /* no stations in PS mode */ @@ -441,7 +441,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; /* device releases frame after DTIM beacon */ - if (!(tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING)) + if (!ieee80211_hw_check(&tx->local->hw, HOST_BROADCAST_PS_BUFFERING)) return TX_CONTINUE; /* buffered in mac80211 */ @@ -1185,8 +1185,8 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && !ieee80211_is_qos_nullfunc(hdr->frame_control) && - (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) && - !(local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) { + ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) && + !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; qc = ieee80211_get_qos_ctl(hdr); @@ -1429,7 +1429,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; - } else if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { dev_kfree_skb(skb); return true; } else @@ -1475,7 +1475,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); - if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { @@ -1490,7 +1490,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) /* handlers after fragment must be aware of tx info fragmentation! */ CALL_TXH(ieee80211_tx_h_stats); CALL_TXH(ieee80211_tx_h_encrypt); - if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_calculate_duration); #undef CALL_TXH @@ -1580,7 +1580,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, /* set up hw_queue value early */ if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) || - !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) + !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; @@ -1607,7 +1607,7 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } if (skb_cloned(skb) && - (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) || + (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) || !skb_clone_writable(skb, ETH_HLEN) || (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt))) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); @@ -2426,7 +2426,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) struct ieee80211_chanctx_conf *chanctx_conf; __le16 fc; - if (!(local->hw.flags & IEEE80211_HW_SUPPORT_FAST_XMIT)) + if (!ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT)) return; /* Locking here protects both the pointer itself, and against concurrent @@ -2442,8 +2442,8 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) * cleared/changed already. */ spin_lock_bh(&sta->lock); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS && - !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) && + if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) && sdata->vif.type == NL80211_IFTYPE_STATION) goto out; @@ -2790,7 +2790,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (fast_tx->key) info->control.hw_key = &fast_tx->key->conf; - if (!(local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) { + if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { tx.skb = skb; r = ieee80211_tx_h_rate_ctrl(&tx); skb = tx.skb; @@ -3807,7 +3807,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) synchronize_net(); /* Tear down BA sessions so we stop aggregating on this TID */ - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); @@ -3821,7 +3821,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ret = 0; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b864ebc6ab8f..43e5aadd7a89 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -564,7 +564,7 @@ ieee80211_get_vif_queues(struct ieee80211_local *local, { unsigned int queues; - if (sdata && local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { + if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; @@ -592,7 +592,7 @@ void __ieee80211_flush_queues(struct ieee80211_local *local, * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ - if (!queues || !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) + if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, @@ -2046,7 +2046,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) * about the sessions, but we and the AP still think they * are active. This is really a workaround though. */ - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { -- cgit From 835a6a2f8603237a3e6cded5a6765090ecb06ea5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 10 Jun 2015 20:28:33 +0300 Subject: Bluetooth: Stop sabotaging list poisoning list_del() poisons pointers with special values, no need to overwrite them. Signed-off-by: Alexey Dobriyan Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 07bd316d02ba..51594fb7b9e7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1601,7 +1601,7 @@ int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user) hci_dev_lock(hdev); - if (user->list.next || user->list.prev) { + if (!list_empty(&user->list)) { ret = -EINVAL; goto out_unlock; } @@ -1631,12 +1631,10 @@ void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user) hci_dev_lock(hdev); - if (!user->list.next || !user->list.prev) + if (list_empty(&user->list)) goto out_unlock; list_del(&user->list); - user->list.next = NULL; - user->list.prev = NULL; user->remove(conn, user); out_unlock: @@ -1651,8 +1649,6 @@ static void l2cap_unregister_all_users(struct l2cap_conn *conn) while (!list_empty(&conn->users)) { user = list_first_entry(&conn->users, struct l2cap_user, list); list_del(&user->list); - user->list.next = NULL; - user->list.prev = NULL; user->remove(conn, user); } } -- cgit From 8c86f967dd24a79ef202fd6f479ca9988ea31f3b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 9 Jun 2015 03:34:13 -0700 Subject: bridge: make br_fdb_delete also check if the port matches Before this patch the user-specified bridge port was ignored when deleting an fdb entry and thus one could delete an entry that belonged to any port. Example (eth0 and eth1 are br0 ports): bridge fdb add 00:11:22:33:44:55 dev eth0 master bridge fdb del 00:11:22:33:44:55 dev eth1 master (succeeds) after the patch: bridge fdb add 00:11:22:33:44:55 dev eth0 master bridge fdb del 00:11:22:33:44:55 dev eth1 master RTNETLINK answers: No such file or directory Based on a patch by Wilson Kok. Reported-by: Wilson Kok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index cecb482ed919..13949a71591d 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -873,13 +873,15 @@ out: return err; } -static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vlan) +static int fdb_delete_by_addr_and_port(struct net_bridge_port *p, + const u8 *addr, u16 vlan) { + struct net_bridge *br = p->br; struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)]; struct net_bridge_fdb_entry *fdb; fdb = fdb_find(head, addr, vlan); - if (!fdb) + if (!fdb || fdb->dst != p) return -ENOENT; fdb_delete(br, fdb); @@ -892,7 +894,7 @@ static int __br_fdb_delete(struct net_bridge_port *p, int err; spin_lock_bh(&p->br->hash_lock); - err = fdb_delete_by_addr(p->br, addr, vid); + err = fdb_delete_by_addr_and_port(p, addr, vid); spin_unlock_bh(&p->br->hash_lock); return err; -- cgit From 37a9a8df8ce9de6ea73349c9ac8bdf6ba4ec4f70 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Wed, 10 Jun 2015 08:44:59 -0400 Subject: net/unix: support SCM_SECURITY for stream sockets SCM_SECURITY was originally only implemented for datagram sockets, not for stream sockets. However, SCM_CREDENTIALS is supported on Unix stream sockets. For consistency, implement Unix stream support for SCM_SECURITY as well. Also clean up the existing code and get rid of the superfluous UNIXSID macro. Motivated by https://bugzilla.redhat.com/show_bug.cgi?id=1224211, where systemd was using SCM_CREDENTIALS and assumed wrongly that SCM_SECURITY was also supported on Unix stream sockets. Signed-off-by: Stephen Smalley Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/unix/af_unix.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f25e1675b865..03ee4d359f6a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -140,12 +140,17 @@ static struct hlist_head *unix_sockets_unbound(void *addr) #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - memcpy(UNIXSID(skb), &scm->secid, sizeof(u32)); + UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - scm->secid = *UNIXSID(skb); + scm->secid = UNIXCB(skb).secid; +} + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -153,6 +158,11 @@ static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return true; +} #endif /* CONFIG_SECURITY_NETWORK */ /* @@ -1414,6 +1424,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; + unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1509,7 +1520,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out_free; max_level = err + 1; - unix_get_secdata(&scm, skb); skb_put(skb, len - data_len); skb->data_len = data_len; @@ -2118,11 +2128,13 @@ unlock: /* Never glue messages from different writers */ if ((UNIXCB(skb).pid != scm.pid) || !uid_eq(UNIXCB(skb).uid, scm.creds.uid) || - !gid_eq(UNIXCB(skb).gid, scm.creds.gid)) + !gid_eq(UNIXCB(skb).gid, scm.creds.gid) || + !unix_secdata_eq(&scm, skb)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); + unix_set_secdata(&scm, skb); check_creds = true; } -- cgit From f9c2ff22bb2df7b8f153afd2a4bea07176bad144 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Jun 2015 22:11:17 -0700 Subject: net: tcp: dctcp_update_alpha() fixes. dctcp_alpha can be read by from dctcp_get_info() without synchro, so use WRITE_ONCE() to prevent compiler from using dctcp_alpha as a temporary variable. Also, playing with small dctcp_shift_g (like 1), can expose an overflow with 32bit values shifted 9 times before divide. Use an u64 field to avoid this problem, and perform the divide only if acked_bytes_ecn is not zero. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_dctcp.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 4c41c1287197..7092a61c4dc8 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { - /* For avoiding denominator == 1. */ - if (ca->acked_bytes_total == 0) - ca->acked_bytes_total = 1; + u64 bytes_ecn = ca->acked_bytes_ecn; + u32 alpha = ca->dctcp_alpha; /* alpha = (1 - g) * alpha + g * F */ - ca->dctcp_alpha = ca->dctcp_alpha - - (ca->dctcp_alpha >> dctcp_shift_g) + - (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / - ca->acked_bytes_total; - if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) - /* Clamp dctcp_alpha to max. */ - ca->dctcp_alpha = DCTCP_MAX_ALPHA; + alpha -= alpha >> dctcp_shift_g; + if (bytes_ecn) { + /* If dctcp_shift_g == 1, a 32bit value would overflow + * after 8 Mbytes. + */ + bytes_ecn <<= (10 - dctcp_shift_g); + do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); + alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); + } + /* dctcp_alpha can be read from dctcp_get_info() without + * synchro, so we ask compiler to not use dctcp_alpha + * as a temporary variable in prior operations. + */ + WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } -- cgit From af201f723f694c8bf12f80c39c897371c4800d31 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Wed, 10 Jun 2015 17:04:49 -0700 Subject: switchdev: fix handling for drivers not supporting IPv4 fib add/del ops If CONFIG_NET_SWITCHDEV is enabled, but port driver does not implement support for IPv4 FIB add/del ops, don't fail route add/del offload operations. Route adds will not be marked as OFFLOAD. Routes will be installed in the kernel FIB, as usual. This was report/fixed by Florian when testing DSA driver with net-next on devices with L2 offload support but no L3 offload support. What he reported was an initial route installed from DHCP client would fail (route not installed to kernel FIB). This was triggering the setting of ipv4.fib_offload_disabled, which would disable route offloading after the first failure. So subsequent attempts to install the route would succeed. There is follow-on work/discussion to address the handling of route install failures, but for now, let's differentiate between no support and failed support. Reported-by: Florian Fainelli Signed-off-by: Scott Feldman Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index e008057dab46..b683e89b4caa 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -853,7 +853,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (!err) fi->fib_flags |= RTNH_F_OFFLOAD; - return err; + return err == -EOPNOTSUPP ? 0 : err; } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); @@ -898,7 +898,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, if (!err) fi->fib_flags &= ~RTNH_F_OFFLOAD; - return err; + return err == -EOPNOTSUPP ? 0 : err; } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); -- cgit From 7782ad8bb55c18f634a6713ec38c8f1ae86ad0b5 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Wed, 10 Jun 2015 19:08:16 +0200 Subject: tcp: export tcp_enter_cwr() Upcoming tcp_cdg uses tcp_enter_cwr() to initiate PRR. Export this function so that CDG can be compiled as a module. Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Stephen Hemminger Cc: Neal Cardwell Cc: David Hayes Cc: Andreas Petlund Cc: Dave Taht Cc: Nicolas Kuhn Signed-off-by: Kenneth Klette Jonassen Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 15c4536188a4..d4f76ab6e136 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2552,6 +2552,7 @@ void tcp_enter_cwr(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_CWR); } } +EXPORT_SYMBOL(tcp_enter_cwr); static void tcp_try_keep_open(struct sock *sk) { -- cgit From 2b0a8c9eee81882fc0001ccf6d9af62cdc682f9e Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Wed, 10 Jun 2015 19:08:17 +0200 Subject: tcp: add CDG congestion control CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies the TCP sender in order to [1]: o Use the delay gradient as a congestion signal. o Back off with an average probability that is independent of the RTT. o Coexist with flows that use loss-based congestion control, i.e., flows that are unresponsive to the delay signal. o Tolerate packet loss unrelated to congestion. (Disabled by default.) Its FreeBSD implementation was presented for the ICCRG in July 2012; slides are available at http://www.ietf.org/proceedings/84/iccrg.html Running the experiment scenarios in [1] suggests that our implementation achieves more goodput compared with FreeBSD 10.0 senders, although it also causes more queueing delay for a given backoff factor. The loss tolerance heuristic is disabled by default due to safety concerns for its use in the Internet [2, p. 45-46]. We use a variant of the Hybrid Slow start algorithm in tcp_cubic to reduce the probability of slow start overshoot. [1] D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using delay gradients." In Networking 2011, pages 328-341. Springer, 2011. [2] K.K. Jonassen. "Implementing CAIA Delay-Gradient in Linux." MSc thesis. Department of Informatics, University of Oslo, 2015. Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Stephen Hemminger Cc: Neal Cardwell Cc: David Hayes Cc: Andreas Petlund Cc: Dave Taht Cc: Nicolas Kuhn Signed-off-by: Kenneth Klette Jonassen Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 20 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_cdg.c | 433 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 454 insertions(+) create mode 100644 net/ipv4/tcp_cdg.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index d83071dccd74..6fb3c90ad726 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -615,6 +615,22 @@ config TCP_CONG_DCTCP For further details see: http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf +config TCP_CONG_CDG + tristate "CAIA Delay-Gradient (CDG)" + default n + ---help--- + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: + + o Use the delay gradient as a congestion signal. + o Back off with an average probability that is independent of the RTT. + o Coexist with flows that use loss-based congestion control. + o Tolerate packet loss unrelated to congestion. + + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -646,6 +662,9 @@ choice config DEFAULT_DCTCP bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_CDG + bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_RENO bool "Reno" endchoice @@ -668,6 +687,7 @@ config DEFAULT_TCP_CONG default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index b36236dd6014..efc43f300b8c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c new file mode 100644 index 000000000000..a52ce2db53f2 --- /dev/null +++ b/net/ipv4/tcp_cdg.c @@ -0,0 +1,433 @@ +/* + * CAIA Delay-Gradient (CDG) congestion control + * + * This implementation is based on the paper: + * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. + * + * Scavenger traffic (Less-than-Best-Effort) should disable coexistence + * heuristics using parameters use_shadow=0 and use_ineff=0. + * + * Parameters window, backoff_beta, and backoff_factor are crucial for + * throughput and delay. Future work is needed to determine better defaults, + * and to provide guidelines for use in different environments/contexts. + * + * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. + * Parameter window is only configurable when loading tcp_cdg as a module. + * + * Notable differences from paper/FreeBSD: + * o Using Hybrid Slow start and Proportional Rate Reduction. + * o Add toggle for shadow window mechanism. Suggested by David Hayes. + * o Add toggle for non-congestion loss tolerance. + * o Scaling parameter G is changed to a backoff factor; + * conversion is given by: backoff_factor = 1000/(G * window). + * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. + * o More accurate e^-x. + */ +#include +#include +#include +#include + +#define HYSTART_ACK_TRAIN 1 +#define HYSTART_DELAY 2 + +static int window __read_mostly = 8; +static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ +static unsigned int backoff_factor __read_mostly = 42; +static unsigned int hystart_detect __read_mostly = 3; +static unsigned int use_ineff __read_mostly = 5; +static bool use_shadow __read_mostly = true; +static bool use_tolerance __read_mostly; + +module_param(window, int, 0444); +MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); +module_param(backoff_beta, uint, 0644); +MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); +module_param(backoff_factor, uint, 0644); +MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); +module_param(hystart_detect, uint, 0644); +MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " + "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); +module_param(use_ineff, uint, 0644); +MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); +module_param(use_shadow, bool, 0644); +MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); +module_param(use_tolerance, bool, 0644); +MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); + +struct minmax { + union { + struct { + s32 min; + s32 max; + }; + u64 v64; + }; +}; + +enum cdg_state { + CDG_UNKNOWN = 0, + CDG_NONFULL = 1, + CDG_FULL = 2, + CDG_BACKOFF = 3, +}; + +struct cdg { + struct minmax rtt; + struct minmax rtt_prev; + struct minmax *gradients; + struct minmax gsum; + bool gfilled; + u8 tail; + u8 state; + u8 delack; + u32 rtt_seq; + u32 undo_cwnd; + u32 shadow_wnd; + u16 backoff_cnt; + u16 sample_cnt; + s32 delay_min; + u32 last_ack; + u32 round_start; +}; + +/** + * nexp_u32 - negative base-e exponential + * @ux: x in units of micro + * + * Returns exp(ux * -1e-6) * U32_MAX. + */ +static u32 __pure nexp_u32(u32 ux) +{ + static const u16 v[] = { + /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ + 65535, + 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, + 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, + }; + u32 msb = ux >> 8; + u32 res; + int i; + + /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ + if (msb > U16_MAX) + return 0; + + /* Scale first eight bits linearly: */ + res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); + + /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ + for (i = 1; msb; i++, msb >>= 1) { + u32 y = v[i & -(msb & 1)] + U32_C(1); + + res = ((u64)res * y) >> 16; + } + + return res; +} + +/* Based on the HyStart algorithm (by Ha et al.) that is implemented in + * tcp_cubic. Differences/experimental changes: + * o Using Hayes' delayed ACK filter. + * o Using a usec clock for the ACK train. + * o Reset ACK train when application limited. + * o Invoked at any cwnd (i.e. also when cwnd < 16). + * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). + */ +static void tcp_cdg_hystart_update(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); + if (ca->delay_min == 0) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { + u32 now_us = local_clock() / NSEC_PER_USEC; + + if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { + ca->last_ack = now_us; + ca->round_start = now_us; + } else if (before(now_us, ca->last_ack + 3000)) { + u32 base_owd = max(ca->delay_min / 2U, 125U); + + ca->last_ack = now_us; + if (after(now_us, ca->round_start + base_owd)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + return; + } + } + } + + if (hystart_detect & HYSTART_DELAY) { + if (ca->sample_cnt < 8) { + ca->sample_cnt++; + } else { + s32 thresh = max(ca->delay_min + ca->delay_min / 8U, + 125U); + + if (ca->rtt.min > thresh) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } + } + } +} + +static s32 tcp_cdg_grad(struct cdg *ca) +{ + s32 gmin = ca->rtt.min - ca->rtt_prev.min; + s32 gmax = ca->rtt.max - ca->rtt_prev.max; + s32 grad; + + if (ca->gradients) { + ca->gsum.min += gmin - ca->gradients[ca->tail].min; + ca->gsum.max += gmax - ca->gradients[ca->tail].max; + ca->gradients[ca->tail].min = gmin; + ca->gradients[ca->tail].max = gmax; + ca->tail = (ca->tail + 1) & (window - 1); + gmin = ca->gsum.min; + gmax = ca->gsum.max; + } + + /* We keep sums to ignore gradients during cwnd reductions; + * the paper's smoothed gradients otherwise simplify to: + * (rtt_latest - rtt_oldest) / window. + * + * We also drop division by window here. + */ + grad = gmin > 0 ? gmin : gmax; + + /* Extrapolate missing values in gradient window: */ + if (!ca->gfilled) { + if (!ca->gradients && window > 1) + grad *= window; /* Memory allocation failed. */ + else if (ca->tail == 0) + ca->gfilled = true; + else + grad = (grad * window) / (int)ca->tail; + } + + /* Backoff was effectual: */ + if (gmin <= -32 || gmax <= -32) + ca->backoff_cnt = 0; + + if (use_tolerance) { + /* Reduce small variations to zero: */ + gmin = DIV_ROUND_CLOSEST(gmin, 64); + gmax = DIV_ROUND_CLOSEST(gmax, 64); + + if (gmin > 0 && gmax <= 0) + ca->state = CDG_FULL; + else if ((gmin > 0 && gmax > 0) || gmax < 0) + ca->state = CDG_NONFULL; + } + return grad; +} + +static bool tcp_cdg_backoff(struct sock *sk, u32 grad) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + return false; + + if (use_ineff) { + ca->backoff_cnt++; + if (ca->backoff_cnt > use_ineff) + return false; + } + + ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); + ca->state = CDG_BACKOFF; + tcp_enter_cwr(sk); + return true; +} + +/* Not called in CWR or Recovery state. */ +static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_cwnd; + u32 incr; + + if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect) + tcp_cdg_hystart_update(sk); + + if (after(ack, ca->rtt_seq) && ca->rtt.v64) { + s32 grad = 0; + + if (ca->rtt_prev.v64) + grad = tcp_cdg_grad(ca); + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + ca->last_ack = 0; + ca->sample_cnt = 0; + + if (grad > 0 && tcp_cdg_backoff(sk, grad)) + return; + } + + if (!tcp_is_cwnd_limited(sk)) { + ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); + return; + } + + prior_snd_cwnd = tp->snd_cwnd; + tcp_reno_cong_avoid(sk, ack, acked); + + incr = tp->snd_cwnd - prior_snd_cwnd; + ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); +} + +static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (rtt_us <= 0) + return; + + /* A heuristic for filtering delayed ACKs, adapted from: + * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support + * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. + */ + if (tp->sacked_out == 0) { + if (num_acked == 1 && ca->delack) { + /* A delayed ACK is only used for the minimum if it is + * provenly lower than an existing non-zero minimum. + */ + ca->rtt.min = min(ca->rtt.min, rtt_us); + ca->delack--; + return; + } else if (num_acked > 1 && ca->delack < 5) { + ca->delack++; + } + } + + ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us); + ca->rtt.max = max(ca->rtt.max, rtt_us); +} + +static u32 tcp_cdg_ssthresh(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->undo_cwnd = tp->snd_cwnd; + + if (ca->state == CDG_BACKOFF) + return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); + + if (ca->state == CDG_NONFULL && use_tolerance) + return tp->snd_cwnd; + + ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); + if (use_shadow) + return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); + return max(2U, tp->snd_cwnd >> 1); +} + +static u32 tcp_cdg_undo_cwnd(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); +} + +static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct minmax *gradients; + + switch (ev) { + case CA_EVENT_CWND_RESTART: + gradients = ca->gradients; + if (gradients) + memset(gradients, 0, window * sizeof(gradients[0])); + memset(ca, 0, sizeof(*ca)); + + ca->gradients = gradients; + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; + break; + case CA_EVENT_COMPLETE_CWR: + ca->state = CDG_UNKNOWN; + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + break; + default: + break; + } +} + +static void tcp_cdg_init(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* We silently fall back to window = 1 if allocation fails. */ + if (window > 1) + ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), + GFP_NOWAIT | __GFP_NOWARN); + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; +} + +static void tcp_cdg_release(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + kfree(ca->gradients); +} + +struct tcp_congestion_ops tcp_cdg __read_mostly = { + .cong_avoid = tcp_cdg_cong_avoid, + .cwnd_event = tcp_cdg_cwnd_event, + .pkts_acked = tcp_cdg_acked, + .undo_cwnd = tcp_cdg_undo_cwnd, + .ssthresh = tcp_cdg_ssthresh, + .release = tcp_cdg_release, + .init = tcp_cdg_init, + .owner = THIS_MODULE, + .name = "cdg", +}; + +static int __init tcp_cdg_register(void) +{ + if (backoff_beta > 1024 || window < 1 || window > 256) + return -ERANGE; + if (!is_power_of_2(window)) + return -EINVAL; + + BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_cdg); + return 0; +} + +static void __exit tcp_cdg_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_cdg); +} + +module_init(tcp_cdg_register); +module_exit(tcp_cdg_unregister); +MODULE_AUTHOR("Kenneth Klette Jonassen"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP CDG"); -- cgit From a4244b0cf58d56c171874e85228ba5deffeb017a Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 11 Jun 2015 10:28:16 +0300 Subject: net/ethtool: Add current supported tunable options Add strings array of the current supported tunable options. Signed-off-by: Hadar Hen Zion Reviewed-by: Amir Vadai Signed-off-by: David S. Miller --- net/core/ethtool.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index eb0c3ace7458..b495ab1797fa 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -106,6 +106,13 @@ rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { [ETH_RSS_HASH_XOR_BIT] = "xor", }; +static const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -194,6 +201,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); + if (sset == ETH_SS_TUNABLES) + return ARRAY_SIZE(tunable_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -211,6 +221,8 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); + else if (stringset == ETH_SS_TUNABLES) + memcpy(data, tunable_strings, sizeof(tunable_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); -- cgit From 9961127d4bce6325e9a0b0fb105e0c85a6c62cb7 Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Thu, 11 Jun 2015 11:25:47 +0200 Subject: NFC: nci: add generic uart support Some NFC controller supports UART as host interface. As with SPI, a lot of code can be shared between vendor drivers. This patch add the generic support of UART and provides some extension API for vendor specific needs. This code is strongly inspired by the Bluetooth HCI ldisc implementation. NCI UART vendor drivers will have to register themselves to this layer via nci_uart_register. Underlying tty will have to be configured from user land thanks to an ioctl. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- net/nfc/nci/Kconfig | 7 + net/nfc/nci/Makefile | 3 + net/nfc/nci/uart.c | 495 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 505 insertions(+) create mode 100644 net/nfc/nci/uart.c (limited to 'net') diff --git a/net/nfc/nci/Kconfig b/net/nfc/nci/Kconfig index a4f1e42e3481..901c1ddba841 100644 --- a/net/nfc/nci/Kconfig +++ b/net/nfc/nci/Kconfig @@ -19,3 +19,10 @@ config NFC_NCI_SPI an NFC Controller (NFCC) and a Device Host (DH). Say yes if you use an NCI driver that requires SPI link layer. + +config NFC_NCI_UART + depends on NFC_NCI && TTY + tristate "NCI over UART protocol support" + default n + help + Say yes if you use an NCI driver that requires UART link layer. diff --git a/net/nfc/nci/Makefile b/net/nfc/nci/Makefile index 7ed8949266cc..b4b85b82e988 100644 --- a/net/nfc/nci/Makefile +++ b/net/nfc/nci/Makefile @@ -7,3 +7,6 @@ obj-$(CONFIG_NFC_NCI) += nci.o nci-objs := core.o data.o lib.o ntf.o rsp.o hci.o nci-$(CONFIG_NFC_NCI_SPI) += spi.o + +nci_uart-y += uart.o +obj-$(CONFIG_NFC_NCI_UART) += nci_uart.o diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c new file mode 100644 index 000000000000..70c279543074 --- /dev/null +++ b/net/nfc/nci/uart.c @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2015, Marvell International Ltd. + * + * This software file (the "File") is distributed by Marvell International + * Ltd. under the terms of the GNU General Public License Version 2, June 1991 + * (the "License"). You may use, redistribute and/or modify this File in + * accordance with the terms and conditions of the License, a copy of which + * is available on the worldwide web at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt. + * + * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE + * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY DISCLAIMED. The License provides additional details about + * this warranty disclaimer. + + */ + +/* Inspired (hugely) by HCI LDISC implementation in Bluetooth. + * + * Copyright (C) 2000-2001 Qualcomm Incorporated + * Copyright (C) 2002-2003 Maxim Krasnyansky + * Copyright (C) 2004-2005 Marcel Holtmann + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* TX states */ +#define NCI_UART_SENDING 1 +#define NCI_UART_TX_WAKEUP 2 + +static struct nci_uart *nci_uart_drivers[NCI_UART_DRIVER_MAX]; + +static inline struct sk_buff *nci_uart_dequeue(struct nci_uart *nu) +{ + struct sk_buff *skb = nu->tx_skb; + + if (!skb) + skb = skb_dequeue(&nu->tx_q); + else + nu->tx_skb = NULL; + + return skb; +} + +static inline int nci_uart_queue_empty(struct nci_uart *nu) +{ + if (nu->tx_skb) + return 0; + + return skb_queue_empty(&nu->tx_q); +} + +static int nci_uart_tx_wakeup(struct nci_uart *nu) +{ + if (test_and_set_bit(NCI_UART_SENDING, &nu->tx_state)) { + set_bit(NCI_UART_TX_WAKEUP, &nu->tx_state); + return 0; + } + + schedule_work(&nu->write_work); + + return 0; +} + +static void nci_uart_write_work(struct work_struct *work) +{ + struct nci_uart *nu = container_of(work, struct nci_uart, write_work); + struct tty_struct *tty = nu->tty; + struct sk_buff *skb; + +restart: + clear_bit(NCI_UART_TX_WAKEUP, &nu->tx_state); + + if (nu->ops.tx_start) + nu->ops.tx_start(nu); + + while ((skb = nci_uart_dequeue(nu))) { + int len; + + set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); + len = tty->ops->write(tty, skb->data, skb->len); + skb_pull(skb, len); + if (skb->len) { + nu->tx_skb = skb; + break; + } + kfree_skb(skb); + } + + if (test_bit(NCI_UART_TX_WAKEUP, &nu->tx_state)) + goto restart; + + if (nu->ops.tx_done && nci_uart_queue_empty(nu)) + nu->ops.tx_done(nu); + + clear_bit(NCI_UART_SENDING, &nu->tx_state); +} + +static int nci_uart_set_driver(struct tty_struct *tty, unsigned int driver) +{ + struct nci_uart *nu = NULL; + int ret; + + if (driver >= NCI_UART_DRIVER_MAX) + return -EINVAL; + + if (!nci_uart_drivers[driver]) + return -ENOENT; + + nu = kzalloc(sizeof(*nu), GFP_KERNEL); + if (!nu) + return -ENOMEM; + + memcpy(nu, nci_uart_drivers[driver], sizeof(struct nci_uart)); + nu->tty = tty; + tty->disc_data = nu; + skb_queue_head_init(&nu->tx_q); + INIT_WORK(&nu->write_work, nci_uart_write_work); + spin_lock_init(&nu->rx_lock); + + ret = nu->ops.open(nu); + if (ret) { + tty->disc_data = NULL; + kfree(nu); + } else if (!try_module_get(nu->owner)) { + nu->ops.close(nu); + tty->disc_data = NULL; + kfree(nu); + return -ENOENT; + } + return ret; +} + +/* ------ LDISC part ------ */ + +/* nci_uart_tty_open + * + * Called when line discipline changed to NCI_UART. + * + * Arguments: + * tty pointer to tty info structure + * Return Value: + * 0 if success, otherwise error code + */ +static int nci_uart_tty_open(struct tty_struct *tty) +{ + /* Error if the tty has no write op instead of leaving an exploitable + * hole + */ + if (!tty->ops->write) + return -EOPNOTSUPP; + + tty->disc_data = NULL; + tty->receive_room = 65536; + + /* Flush any pending characters in the driver and line discipline. */ + + /* FIXME: why is this needed. Note don't use ldisc_ref here as the + * open path is before the ldisc is referencable. + */ + + if (tty->ldisc->ops->flush_buffer) + tty->ldisc->ops->flush_buffer(tty); + tty_driver_flush_buffer(tty); + + return 0; +} + +/* nci_uart_tty_close() + * + * Called when the line discipline is changed to something + * else, the tty is closed, or the tty detects a hangup. + */ +static void nci_uart_tty_close(struct tty_struct *tty) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + /* Detach from the tty */ + tty->disc_data = NULL; + + if (!nu) + return; + + if (nu->tx_skb) + kfree_skb(nu->tx_skb); + if (nu->rx_skb) + kfree_skb(nu->rx_skb); + + skb_queue_purge(&nu->tx_q); + + nu->ops.close(nu); + nu->tty = NULL; + module_put(nu->owner); + + cancel_work_sync(&nu->write_work); + + kfree(nu); +} + +/* nci_uart_tty_wakeup() + * + * Callback for transmit wakeup. Called when low level + * device driver can accept more send data. + * + * Arguments: tty pointer to associated tty instance data + * Return Value: None + */ +static void nci_uart_tty_wakeup(struct tty_struct *tty) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + if (!nu) + return; + + clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); + + if (tty != nu->tty) + return; + + nci_uart_tx_wakeup(nu); +} + +/* nci_uart_tty_receive() + * + * Called by tty low level driver when receive data is + * available. + * + * Arguments: tty pointer to tty isntance data + * data pointer to received data + * flags pointer to flags for data + * count count of received data in bytes + * + * Return Value: None + */ +static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data, + char *flags, int count) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + if (!nu || tty != nu->tty) + return; + + spin_lock(&nu->rx_lock); + nu->ops.recv_buf(nu, (void *)data, flags, count); + spin_unlock(&nu->rx_lock); + + tty_unthrottle(tty); +} + +/* nci_uart_tty_ioctl() + * + * Process IOCTL system call for the tty device. + * + * Arguments: + * + * tty pointer to tty instance data + * file pointer to open file object for device + * cmd IOCTL command code + * arg argument for IOCTL call (cmd dependent) + * + * Return Value: Command dependent + */ +static int nci_uart_tty_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct nci_uart *nu = (void *)tty->disc_data; + int err = 0; + + switch (cmd) { + case NCIUARTSETDRIVER: + if (!nu) + return nci_uart_set_driver(tty, (unsigned int)arg); + else + return -EBUSY; + break; + default: + err = n_tty_ioctl_helper(tty, file, cmd, arg); + break; + } + + return err; +} + +/* We don't provide read/write/poll interface for user space. */ +static ssize_t nci_uart_tty_read(struct tty_struct *tty, struct file *file, + unsigned char __user *buf, size_t nr) +{ + return 0; +} + +static ssize_t nci_uart_tty_write(struct tty_struct *tty, struct file *file, + const unsigned char *data, size_t count) +{ + return 0; +} + +static unsigned int nci_uart_tty_poll(struct tty_struct *tty, + struct file *filp, poll_table *wait) +{ + return 0; +} + +static int nci_uart_send(struct nci_uart *nu, struct sk_buff *skb) +{ + /* Queue TX packet */ + skb_queue_tail(&nu->tx_q, skb); + + /* Try to start TX (if possible) */ + nci_uart_tx_wakeup(nu); + + return 0; +} + +/* -- Default recv_buf handler -- + * + * This handler supposes that NCI frames are sent over UART link without any + * framing. It reads NCI header, retrieve the packet size and once all packet + * bytes are received it passes it to nci_uart driver for processing. + */ +static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, + char *flags, int count) +{ + int chunk_len; + + if (!nu->ndev) { + nfc_err(nu->tty->dev, + "receive data from tty but no NCI dev is attached yet, drop buffer\n"); + return 0; + } + + /* Decode all incoming data in packets + * and enqueue then for processing. + */ + while (count > 0) { + /* If this is the first data of a packet, allocate a buffer */ + if (!nu->rx_skb) { + nu->rx_packet_len = -1; + nu->rx_skb = nci_skb_alloc(nu->ndev, + NCI_MAX_PACKET_SIZE, + GFP_KERNEL); + if (!nu->rx_skb) + return -ENOMEM; + } + + /* Eat byte after byte till full packet header is received */ + if (nu->rx_skb->len < NCI_CTRL_HDR_SIZE) { + *skb_put(nu->rx_skb, 1) = *data++; + --count; + continue; + } + + /* Header was received but packet len was not read */ + if (nu->rx_packet_len < 0) + nu->rx_packet_len = NCI_CTRL_HDR_SIZE + + nci_plen(nu->rx_skb->data); + + /* Compute how many bytes are missing and how many bytes can + * be consumed. + */ + chunk_len = nu->rx_packet_len - nu->rx_skb->len; + if (count < chunk_len) + chunk_len = count; + memcpy(skb_put(nu->rx_skb, chunk_len), data, chunk_len); + data += chunk_len; + count -= chunk_len; + + /* Chcek if packet is fully received */ + if (nu->rx_packet_len == nu->rx_skb->len) { + /* Pass RX packet to driver */ + if (nu->ops.recv(nu, nu->rx_skb) != 0) + nfc_err(nu->tty->dev, "corrupted RX packet\n"); + /* Next packet will be a new one */ + nu->rx_skb = NULL; + } + } + + return 0; +} + +/* -- Default recv handler -- */ +static int nci_uart_default_recv(struct nci_uart *nu, struct sk_buff *skb) +{ + return nci_recv_frame(nu->ndev, skb); +} + +int nci_uart_register(struct nci_uart *nu) +{ + if (!nu || !nu->ops.open || + !nu->ops.recv || !nu->ops.close) + return -EINVAL; + + /* Set the send callback */ + nu->ops.send = nci_uart_send; + + /* Install default handlers if not overridden */ + if (!nu->ops.recv_buf) + nu->ops.recv_buf = nci_uart_default_recv_buf; + if (!nu->ops.recv) + nu->ops.recv = nci_uart_default_recv; + + /* Add this driver in the driver list */ + if (!nci_uart_drivers[nu->driver]) { + pr_err("driver %d is already registered\n", nu->driver); + return -EBUSY; + } + nci_uart_drivers[nu->driver] = nu; + + pr_info("NCI uart driver '%s [%d]' registered\n", nu->name, nu->driver); + + return 0; +} +EXPORT_SYMBOL_GPL(nci_uart_register); + +void nci_uart_unregister(struct nci_uart *nu) +{ + pr_info("NCI uart driver '%s [%d]' unregistered\n", nu->name, + nu->driver); + + /* Remove this driver from the driver list */ + nci_uart_drivers[nu->driver] = NULL; +} +EXPORT_SYMBOL_GPL(nci_uart_unregister); + +void nci_uart_set_config(struct nci_uart *nu, int baudrate, int flow_ctrl) +{ + struct ktermios new_termios; + + if (!nu->tty) + return; + + down_read(&nu->tty->termios_rwsem); + new_termios = nu->tty->termios; + up_read(&nu->tty->termios_rwsem); + tty_termios_encode_baud_rate(&new_termios, baudrate, baudrate); + + if (flow_ctrl) + new_termios.c_cflag |= CRTSCTS; + else + new_termios.c_cflag &= ~CRTSCTS; + + tty_set_termios(nu->tty, &new_termios); +} +EXPORT_SYMBOL_GPL(nci_uart_set_config); + +static struct tty_ldisc_ops nci_uart_ldisc = { + .magic = TTY_LDISC_MAGIC, + .owner = THIS_MODULE, + .name = "n_nci", + .open = nci_uart_tty_open, + .close = nci_uart_tty_close, + .read = nci_uart_tty_read, + .write = nci_uart_tty_write, + .poll = nci_uart_tty_poll, + .receive_buf = nci_uart_tty_receive, + .write_wakeup = nci_uart_tty_wakeup, + .ioctl = nci_uart_tty_ioctl, +}; + +static int __init nci_uart_init(void) +{ + memset(nci_uart_drivers, 0, sizeof(nci_uart_drivers)); + return tty_register_ldisc(N_NCI, &nci_uart_ldisc); +} + +static void __exit nci_uart_exit(void) +{ + tty_unregister_ldisc(N_NCI); +} + +module_init(nci_uart_init); +module_exit(nci_uart_exit); + +MODULE_AUTHOR("Marvell International Ltd."); +MODULE_DESCRIPTION("NFC NCI UART driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_LDISC(N_NCI); -- cgit From e097dc624f784debbde49701a493bf920bc422c7 Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Thu, 11 Jun 2015 14:00:20 +0200 Subject: NFC: nfcmrvl: add UART driver Add support of Marvell NFC chip controlled over UART Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- net/nfc/nci/uart.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 70c279543074..2c48810af5bb 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -12,7 +12,6 @@ * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE * ARE EXPRESSLY DISCLAIMED. The License provides additional details about * this warranty disclaimer. - */ /* Inspired (hugely) by HCI LDISC implementation in Bluetooth. -- cgit From 57225e7720ac3d7ffcb0086c716753abf6e54e8d Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Thu, 11 Jun 2015 08:19:01 -0700 Subject: switchdev: fix BUG when port driver doesn't support set attr op Fix a BUG_ON() where CONFIG_NET_SWITCHDEV is set but the driver for a bridged port does not support switchdev_port_attr_set op. Don't BUG_ON() if -EOPNOTSUPP is returned. Also change BUG_ON() to netdev_err since this is a normal error path and does not warrant the use of BUG_ON(), which is reserved for unrecoverable errs. Signed-off-by: Scott Feldman Reported-by: Brenden Blanco Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index b683e89b4caa..658bc3ac8008 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -103,7 +103,9 @@ static void switchdev_port_attr_set_work(struct work_struct *work) rtnl_lock(); err = switchdev_port_attr_set(asw->dev, &asw->attr); - BUG_ON(err); + if (err && err != -EOPNOTSUPP) + netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n", + err, asw->attr.id); rtnl_unlock(); dev_put(asw->dev); -- cgit From a7eea416cb08a514f94c0ca5ff30c18783fab054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:15 -0700 Subject: tcp: reserve tcp_skb_mss() to tcp stack tcp_gso_segment() and tcp_gro_receive() are not strictly part of TCP stack. They should not assume tcp_skb_mss(skb) is in fact skb_shinfo(skb)->gso_size. This will allow us to change tcp_skb_mss() in following patches. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 3f7c2fca5431..9864a2dbadce 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, oldlen = (u16)~skb->len; __skb_pull(skb, thlen); - mss = tcp_skb_mss(skb); + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -242,7 +242,7 @@ found: flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); - mss = tcp_skb_mss(p); + mss = skb_shinfo(p)->gso_size; flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); -- cgit From 51466a7545b73b7ad7bcfb33410d2823ccfaa501 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:16 -0700 Subject: tcp: fill shinfo->gso_type at last moment Our goal is to touch skb_shinfo(skb) only when absolutely needed, to avoid two cache line misses in TCP output path for last skb that is considered but not sent because of various conditions (cwnd, tso defer, receiver window, TSQ...) A packet is GSO only when skb_shinfo(skb)->gso_size is not zero. We can set skb_shinfo(skb)->gso_type to sk->sk_gso_type even for non GSO packets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++------ net/ipv4/tcp_output.c | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d4f76ab6e136..70a6fa8ecbd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1316,16 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * code can come after this skb later on it's better to keep * setting gso_size to something. */ - if (!skb_shinfo(prev)->gso_size) { + if (!skb_shinfo(prev)->gso_size) skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type = sk->sk_gso_type; - } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (tcp_skb_pcount(skb) <= 1) { + if (tcp_skb_pcount(skb) <= 1) skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - } /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index eeb59befaf06..a51f7aab27d6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -412,7 +412,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; - shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -1003,6 +1002,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); + skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) tcp_ecn_send(sk, skb, tcp_header_size); @@ -1080,11 +1080,9 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, */ tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; - shinfo->gso_type = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); shinfo->gso_size = mss_now; - shinfo->gso_type = sk->sk_gso_type; } } -- cgit From 5bbb432c896d23ce8f41f38e88dbd38982df99f9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:17 -0700 Subject: tcp: tcp_set_skb_tso_segs() no longer need struct sock parameter tcp_set_skb_tso_segs() & tcp_init_tso_segs() no longer use the sock pointer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a51f7aab27d6..d12888581337 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,8 +1066,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -1214,8 +1213,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* If this packet has been sent out already, we must * adjust the various packet counters. @@ -1295,7 +1294,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } @@ -1627,13 +1626,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, * This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); tso_segs = tcp_skb_pcount(skb); } return tso_segs; @@ -1688,7 +1686,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, const struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) return 0; @@ -1757,8 +1755,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); @@ -1992,7 +1990,7 @@ static int tcp_mtu_probe(struct sock *sk) skb->len, 0); } else { __pskb_trim_head(skb, copy); - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); } TCP_SKB_CB(skb)->seq += copy; } @@ -2002,7 +2000,7 @@ static int tcp_mtu_probe(struct sock *sk) if (len >= probe_size) break; } - tcp_init_tso_segs(sk, nskb, nskb->len); + tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). @@ -2064,7 +2062,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); + tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { @@ -2618,7 +2616,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (unlikely(oldpcount > 1)) { if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } } @@ -3455,7 +3453,7 @@ int tcp_write_wakeup(struct sock *sk, int mib) if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); -- cgit From f69ad292cfd13aa7ee00847320c6bb9ba2154e87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:18 -0700 Subject: tcp: fill shinfo->gso_size at last moment In commit cd7d8498c9a5 ("tcp: change tcp_skb_pcount() location") we stored gso_segs in a temporary cache hot location. This patch does the same for gso_size. This allows to save 2 cache line misses in tcp xmit path for the last packet that is considered but not sent because of various conditions (cwnd, tso defer, receiver window, TSQ...) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 12 ++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 70a6fa8ecbd3..684f095d196e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1316,12 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * code can come after this skb later on it's better to keep * setting gso_size to something. */ - if (!skb_shinfo(prev)->gso_size) - skb_shinfo(prev)->gso_size = mss; + if (!TCP_SKB_CB(prev)->tcp_gso_size) + TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (tcp_skb_pcount(skb) <= 1) - skb_shinfo(skb)->gso_size = 0; + TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -2248,7 +2248,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) (oldcnt >= packets)) break; - mss = skb_shinfo(skb)->gso_size; + mss = tcp_skb_mss(skb); err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss, GFP_ATOMIC); if (err < 0) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d12888581337..787f57ff87c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -402,8 +402,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -411,7 +409,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->sacked = 0; tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -1028,8 +1025,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); - /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1068,8 +1066,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Make sure we own this skb before messing gso_size/gso_segs */ WARN_ON_ONCE(skb_cloned(skb)); @@ -1078,10 +1074,10 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) * non-TSO case. */ tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; + TCP_SKB_CB(skb)->tcp_gso_size = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - shinfo->gso_size = mss_now; + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } -- cgit From b5e2c45783aa785cbb195e43d5f0c0c6b228bde4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:19 -0700 Subject: tcp: remove obsolete check in tcp_set_skb_tso_segs() We had various issues in the past when TCP stack was modifying gso_size/gso_segs while clones were in flight. Commit c52e2421f73 ("tcp: must unclone packets before mangling them") fixed these bugs and added a WARN_ON_ONCE(skb_cloned(skb)); in tcp_set_skb_tso_segs() These bugs are now fixed, and because TCP stack now only sets shinfo->gso_size|segs on the clone itself, the check can be removed. As a result of this change, compiler inlines tcp_set_skb_tso_segs() in tcp_init_tso_segs() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 787f57ff87c4..b1c218df2c85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,9 +1066,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - /* Make sure we own this skb before messing gso_size/gso_segs */ - WARN_ON_ONCE(skb_cloned(skb)); - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. -- cgit From 2fd36558f02c0606768929fc77671716680d01c2 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 11 Jun 2015 13:52:26 +0300 Subject: Bluetooth: Add debugfs support for max LE encryption key size This patch adds a debugfs control to set a different maximum LE encryption key size. This is useful for testing that implementation of the encryption key size handling is behaving correctly. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4bfaa3d3ed28..d0220fb76dc0 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -33,6 +33,9 @@ #include "ecc.h" #include "smp.h" +#define SMP_DEV(hdev) \ + ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data) + /* Low-level debug macros to be used for stuff that we don't want * accidentially in dmesg, i.e. the values of the various crypto keys * and the inputs & outputs of crypto functions. @@ -81,6 +84,8 @@ struct smp_dev { u8 local_rand[16]; bool debug_key; + u8 max_key_size; + struct crypto_blkcipher *tfm_aes; struct crypto_hash *tfm_cmac; }; @@ -708,7 +713,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = oob_flag; - req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->max_key_size = SMP_DEV(hdev)->max_key_size; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -719,7 +724,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = oob_flag; - rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->max_key_size = SMP_DEV(hdev)->max_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -730,10 +735,11 @@ static void build_pairing_cmd(struct l2cap_conn *conn, static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { struct l2cap_chan *chan = conn->smp; + struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; - if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) || - (max_key_size < SMP_MIN_ENC_KEY_SIZE)) + if (max_key_size > SMP_DEV(hdev)->max_key_size || + max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; smp->enc_key_size = max_key_size; @@ -3130,6 +3136,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; + smp->max_key_size = SMP_MAX_ENC_KEY_SIZE; create_chan: chan = l2cap_chan_create(); @@ -3252,6 +3259,49 @@ static const struct file_operations force_bredr_smp_fops = { .llseek = default_llseek, }; +static ssize_t le_max_key_size_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[4]; + + snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size); + + return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); +} + +static ssize_t le_max_key_size_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + u8 key_size; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + sscanf(buf, "%hhu", &key_size); + + if (key_size > SMP_MAX_ENC_KEY_SIZE || key_size < SMP_MIN_ENC_KEY_SIZE) + return -EINVAL; + + SMP_DEV(hdev)->max_key_size = key_size; + + return count; +} + +static const struct file_operations le_max_key_size_fops = { + .open = simple_open, + .read = le_max_key_size_read, + .write = le_max_key_size_write, + .llseek = default_llseek, +}; + int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; @@ -3276,6 +3326,9 @@ int smp_register(struct hci_dev *hdev) hdev->smp_data = chan; + debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev, + &le_max_key_size_fops); + /* If the controller does not support BR/EDR Secure Connections * feature, then the BR/EDR SMP channel shall not be present. * -- cgit From b1f663c91c9dd96cc3e57ce7e90d29a6b43b9e16 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 11 Jun 2015 13:52:27 +0300 Subject: Bluetooth: Add debugfs support for min LE encryption key size This patch adds a debugfs control to set a different minimum LE encryption key size. This is useful for testing that implementation of the encryption key size handling is behaving correctly (e.g. that we get appropriate 'Encryption Key Size' error responses when necessary). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d0220fb76dc0..d9ed5e8ee6a0 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -84,6 +84,7 @@ struct smp_dev { u8 local_rand[16]; bool debug_key; + u8 min_key_size; u8 max_key_size; struct crypto_blkcipher *tfm_aes; @@ -3136,6 +3137,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; + smp->min_key_size = SMP_MIN_ENC_KEY_SIZE; smp->max_key_size = SMP_MAX_ENC_KEY_SIZE; create_chan: @@ -3259,6 +3261,50 @@ static const struct file_operations force_bredr_smp_fops = { .llseek = default_llseek, }; +static ssize_t le_min_key_size_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[4]; + + snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size); + + return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); +} + +static ssize_t le_min_key_size_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + u8 key_size; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + sscanf(buf, "%hhu", &key_size); + + if (key_size > SMP_DEV(hdev)->max_key_size || + key_size < SMP_MIN_ENC_KEY_SIZE) + return -EINVAL; + + SMP_DEV(hdev)->min_key_size = key_size; + + return count; +} + +static const struct file_operations le_min_key_size_fops = { + .open = simple_open, + .read = le_min_key_size_read, + .write = le_min_key_size_write, + .llseek = default_llseek, +}; + static ssize_t le_max_key_size_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -3287,7 +3333,8 @@ static ssize_t le_max_key_size_write(struct file *file, sscanf(buf, "%hhu", &key_size); - if (key_size > SMP_MAX_ENC_KEY_SIZE || key_size < SMP_MIN_ENC_KEY_SIZE) + if (key_size > SMP_MAX_ENC_KEY_SIZE || + key_size < SMP_DEV(hdev)->min_key_size) return -EINVAL; SMP_DEV(hdev)->max_key_size = key_size; @@ -3326,6 +3373,8 @@ int smp_register(struct hci_dev *hdev) hdev->smp_data = chan; + debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev, + &le_min_key_size_fops); debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev, &le_max_key_size_fops); -- cgit From 035ad621b6e625e185fe7b6929788d1d83727079 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 11 Jun 2015 13:52:28 +0300 Subject: Bluetooth: Move SC-only check outside of BT_CONFIG branch Checking for SC-only mode requirements when we get an encrypt change event shouldn't be limited to the BT_CONFIG state but done any time encryption changes. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fcbfa4138eb1..62934151431b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2650,22 +2650,22 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; } + /* In Secure Connections Only mode, do not allow any connections + * that are not encrypted with AES-CCM using a P-256 authenticated + * combination key. + */ + if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && + (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || + conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { + hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); + goto unlock; + } + if (conn->state == BT_CONFIG) { if (!ev->status) conn->state = BT_CONNECTED; - /* In Secure Connections Only mode, do not allow any - * connections that are not encrypted with AES-CCM - * using a P-256 authenticated combination key. - */ - if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && - (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || - conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { - hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_drop(conn); - goto unlock; - } - hci_connect_cfm(conn, ev->status); hci_conn_drop(conn); } else -- cgit From 821f37666815c9f3a7a4d195ce9184ad4d084942 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 11 Jun 2015 13:52:29 +0300 Subject: Bluetooth: Read encryption key size for BR/EDR connections Since Bluetooth 3.0 there's a HCI command available for reading the encryption key size of an BR/EDR connection. This information is essential e.g. for generating an LTK using SMP over BR/EDR, so store it as part of struct hci_conn. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 62934151431b..88c57b12a222 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2603,6 +2603,64 @@ unlock: hci_dev_unlock(hdev); } +static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + const struct hci_rp_read_enc_key_size *rp; + struct hci_conn *conn; + u16 handle; + + BT_DBG("%s status 0x%02x", hdev->name, status); + + if (!skb || skb->len < sizeof(*rp)) { + BT_ERR("%s invalid HCI Read Encryption Key Size response", + hdev->name); + return; + } + + rp = (void *)skb->data; + handle = le16_to_cpu(rp->handle); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + goto unlock; + + /* If we fail to read the encryption key size, assume maximum + * (which is the same we do also when this HCI command isn't + * supported. + */ + if (rp->status) { + BT_ERR("%s failed to read key size for handle %u", hdev->name, + handle); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + conn->enc_key_size = rp->key_size; + } + + if (conn->state == BT_CONFIG) { + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, 0); + hci_conn_drop(conn); + } else { + u8 encrypt; + + if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) + encrypt = 0x00; + else if (conn->type == ACL_LINK && + test_bit(HCI_CONN_AES_CCM, &conn->flags)) + encrypt = 0x02; + else + encrypt = 0x01; + + hci_encrypt_cfm(conn, 0, encrypt); + } + +unlock: + hci_dev_unlock(hdev); +} + static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_encrypt_change *ev = (void *) skb->data; @@ -2662,6 +2720,35 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; } + /* Try reading the encryption key size for encrypted ACL links */ + if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { + struct hci_cp_read_enc_key_size cp; + struct hci_request req; + + /* Only send HCI_Read_Encryption_Key_Size if the + * controller really supports it. If it doesn't, assume + * the default size (16). + */ + if (!(hdev->commands[20] & 0x10)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + goto notify; + } + + hci_req_init(&req, hdev); + + cp.handle = cpu_to_le16(conn->handle); + hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); + + if (hci_req_run_skb(&req, read_enc_key_size_complete)) { + BT_ERR("Sending HCI Read Encryption Key Size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + goto notify; + } + + goto unlock; + } + +notify: if (conn->state == BT_CONFIG) { if (!ev->status) conn->state = BT_CONNECTED; -- cgit From e3f6a257a73d03799918a79529632f2df2b4d668 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 11 Jun 2015 13:52:30 +0300 Subject: Bluetooth: Use actual encryption key size for SMP over BR/EDR When pairing over SMP over BR/EDR the generated LTK has by default the same key size as the BR/EDR Link Key. Make sure we don't set our Pairing Request/Response max value to anything higher than that. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d9ed5e8ee6a0..3d0f7d2a0616 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1705,7 +1705,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; - req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->max_key_size = conn->hcon->enc_key_size; smp->remote_key_dist = remote_dist; @@ -1714,7 +1714,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, memset(rsp, 0, sizeof(*rsp)); - rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->max_key_size = conn->hcon->enc_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; -- cgit From 5d667ef6e01be22343b07a1f5d5a976962e3103e Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 12 Jun 2015 13:04:47 +0300 Subject: Bluetooth: Remove redundant check for ACL_LINK The encryption key size is read only for BR/EDR (ACL_LINK) connections so there's no need to check for it in the read_enc_key_size_complete() callback. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 88c57b12a222..32363c2b7f83 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2648,8 +2648,7 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) encrypt = 0x00; - else if (conn->type == ACL_LINK && - test_bit(HCI_CONN_AES_CCM, &conn->flags)) + else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) encrypt = 0x02; else encrypt = 0x01; -- cgit From 779668450a990d402d316ffd9b7b103fbe5ab6f3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 21 May 2015 10:57:12 -0300 Subject: netfilter: conntrack: warn the user if there is a better helper to use After db29a9508a92 ("netfilter: conntrack: disable generic tracking for known protocols"), if the specific helper is built but not loaded (a standard for most distributions) systems with a restrictive firewall but weak configuration regarding netfilter modules to load, will silently stop working. This patch then puts a warning message so the sysadmin knows where to start looking into. It's a pr_warn_once regardless of protocol itself but it should be enough to give a hint on where to look. Cc: Florian Westphal Cc: Daniel Borkmann Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_generic.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 60865f110309..2281be419a74 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -90,7 +90,13 @@ static int generic_packet(struct nf_conn *ct, static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { - return nf_generic_should_process(nf_ct_protonum(ct)); + bool ret; + + ret = nf_generic_should_process(nf_ct_protonum(ct)); + if (!ret) + pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", + nf_ct_protonum(ct)); + return ret; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) -- cgit From d39a33ed9b9ac0939a5b2ddc4dd3d283373bbe89 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:26:13 +0200 Subject: netfilter: bridge: refactor clearing BRNF_NF_BRIDGE_PREROUTING use binary AND on complement of BRNF_NF_BRIDGE_PREROUTING to unset bit in nf_bridge->mask. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 46660a28feef..2651876894e0 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -290,7 +290,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; rt = bridge_parent_rtable(nf_bridge->physindev); if (!rt) { @@ -415,7 +415,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; if (daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); -- cgit From 8cae308d2bc81f95c320e7a345b92be6c238f510 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:26:57 +0200 Subject: netfilter: bridge: re-order br_nf_pre_routing_finish_ipv6() Put br_nf_pre_routing_finish_ipv6() after daddr_was_changed() and br_nf_pre_routing_finish_bridge() to prepare calling these functions from there. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 63 ++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 2651876894e0..6cb642c43451 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -278,37 +278,6 @@ static void nf_bridge_update_protocol(struct sk_buff *skb) } } -/* PF_BRIDGE/PRE_ROUTING *********************************************/ -/* Undo the changes made for ip6tables PREROUTING and continue the - * bridge PRE_ROUTING hook. */ -static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct rtable *rt; - - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; - - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; - } - skb_dst_set_noref(skb, &rt->dst); - - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - - return 0; -} - /* Obtain the correct destination MAC address, while preserving the original * source MAC address. If we already know this address, we just copy it. If we * don't, we use the neighbour framework to find out. In both cases, we make @@ -360,6 +329,38 @@ static bool daddr_was_changed(const struct sk_buff *skb, return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } +/* PF_BRIDGE/PRE_ROUTING *********************************************/ +/* Undo the changes made for ip6tables PREROUTING and continue the + * bridge PRE_ROUTING hook. + */ +static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct rtable *rt; + + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. * This is also true when SNAT takes place (for the reply direction). -- cgit From 72b31f7271df34c6aab36c01305287924826678f Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:27:40 +0200 Subject: netfilter: bridge: detect NAT66 correctly and change MAC address IPv4 iptables allows to REDIRECT/DNAT/SNAT any traffic over a bridge. e.g. REDIRECT $ sysctl -w net.bridge.bridge-nf-call-iptables=1 $ iptables -t nat -A PREROUTING -p tcp -m tcp --dport 8080 \ -j REDIRECT --to-ports 81 This does not work with ip6tables on a bridge in NAT66 scenario because the REDIRECT/DNAT/SNAT is not correctly detected. The bridge pre-routing (finish) netfilter hook has to check for a possible redirect and then fix the destination mac address. This allows to use the ip6tables rules for local REDIRECT/DNAT/SNAT REDIRECT similar to the IPv4 iptables version. e.g. REDIRECT $ sysctl -w net.bridge.bridge-nf-call-ip6tables=1 $ ip6tables -t nat -A PREROUTING -p tcp -m tcp --dport 8080 \ -j REDIRECT --to-ports 81 This patch makes it possible to use IPv6 NAT66 on a bridge. It was tested on a bridge with two interfaces using SNAT/DNAT NAT66 rules. Reported-by: Artie Hamilton Signed-off-by: Sven Eckelmann [bernhard.thaler@wvnet.at: rebased, add indirect call to ip6_route_input()] [bernhard.thaler@wvnet.at: rebased, split into separate patches] Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 55 +++++++++++++++++++++++++++++++++++++++-------- net/ipv6/netfilter.c | 1 + 2 files changed, 47 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 6cb642c43451..9ac0c6417c77 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -326,30 +326,63 @@ free_skb: static bool daddr_was_changed(const struct sk_buff *skb, const struct nf_bridge_info *nf_bridge) { - return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; + switch (skb->protocol) { + case htons(ETH_P_IP): + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; + case htons(ETH_P_IPV6): + return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, + sizeof(ipv6_hdr(skb)->daddr)) != 0; + default: + return false; + } } -/* PF_BRIDGE/PRE_ROUTING *********************************************/ -/* Undo the changes made for ip6tables PREROUTING and continue the - * bridge PRE_ROUTING hook. +/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables + * PREROUTING and continue the bridge PRE_ROUTING hook. See comment + * for br_nf_pre_routing_finish(), same logic is used here but + * equivalent IPv6 function ip6_route_input() called indirectly. */ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; + struct net_device *dev = skb->dev; + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + if (daddr_was_changed(skb, nf_bridge)) { + skb_dst_drop(skb); + v6ops->route_input(skb); - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; + if (skb_dst(skb)->error) { + kfree_skb(skb); + return 0; + } + + if (skb_dst(skb)->dev == dev) { + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, + sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); + skb->pkt_type = PACKET_HOST; + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); } - skb_dst_set_noref(skb, &rt->dst); skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); @@ -579,6 +612,7 @@ static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { + struct nf_bridge_info *nf_bridge; const struct ipv6hdr *hdr; u32 pkt_len; @@ -610,6 +644,9 @@ static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, if (!setup_pre_routing(skb)) return NF_DROP; + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; + skb->protocol = htons(ETH_P_IPV6); NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, skb->dev, NULL, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d958718b5031..bbca09fe9553 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -191,6 +191,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, static const struct nf_ipv6_ops ipv6ops = { .chk_addr = ipv6_chk_addr, + .route_input = ip6_route_input }; static const struct nf_afinfo nf_ip6_afinfo = { -- cgit From 411ffb4fde80705a9a8db4c2d38dbeef6f5bd689 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:28:28 +0200 Subject: netfilter: bridge: refactor frag_max_size Currently frag_max_size is member of br_input_skb_cb and copied back and forth using IPCB(skb) and BR_INPUT_SKB_CB(skb) each time it is changed or used. Attach frag_max_size to nf_bridge_info and set value in pre_routing and forward functions. Use its value in forward and xmit functions. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 20 +++++++------------- net/bridge/br_private.h | 1 - 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 9ac0c6417c77..1f30b28745de 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -440,10 +440,8 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; int err; - int frag_max_size; - frag_max_size = IPCB(skb)->frag_max_size; - BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size; + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; @@ -738,11 +736,9 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) struct net_device *in; if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { - int frag_max_size; if (skb->protocol == htons(ETH_P_IP)) { - frag_max_size = IPCB(skb)->frag_max_size; - BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size; + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; } in = nf_bridge->physindev; @@ -806,12 +802,9 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, } if (pf == NFPROTO_IPV4) { - int frag_max = BR_INPUT_SKB_CB(skb)->frag_max_size; - if (br_parse_ip_options(skb)) return NF_DROP; - - IPCB(skb)->frag_max_size = frag_max; + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; } nf_bridge->physoutdev = skb->dev; @@ -904,7 +897,7 @@ static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) { int ret; - int frag_max_size; + struct nf_bridge_info *nf_bridge; unsigned int mtu_reserved; if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) { @@ -913,17 +906,18 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) } mtu_reserved = nf_bridge_mtu_reduction(skb); + nf_bridge = nf_bridge_info_get(skb); /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ if (skb->len + mtu_reserved > skb->dev->mtu) { struct brnf_frag_data *data; - frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; if (br_parse_ip_options(skb)) /* Drop invalid packet */ return NF_DROP; - IPCB(skb)->frag_max_size = frag_max_size; + + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f36fa70639b..8cde96e68fd5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -304,7 +304,6 @@ struct br_input_skb_cb { int mrouters_only; #endif - u16 frag_max_size; bool proxyarp_replied; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit From 77d574e7283cfcbb2c134f4431f77dff1c54423e Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:29:02 +0200 Subject: netfilter: bridge: rename br_parse_ip_options br_parse_ip_options() does not parse any IP options, it validates IP packets as a whole and the function name is misleading. Rename br_parse_ip_options() to br_validate_ipv4() and remove unneeded commments. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 1f30b28745de..962d5f867f62 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -216,7 +216,7 @@ static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) * expected format */ -static int br_parse_ip_options(struct sk_buff *skb) +static int br_validate_ipv4(struct sk_buff *skb) { const struct iphdr *iph; struct net_device *dev = skb->dev; @@ -692,7 +692,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, nf_bridge_pull_encap_header_rcsum(skb); - if (br_parse_ip_options(skb)) + if (br_validate_ipv4(skb)) return NF_DROP; nf_bridge_put(skb->nf_bridge); @@ -802,7 +802,7 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, } if (pf == NFPROTO_IPV4) { - if (br_parse_ip_options(skb)) + if (br_validate_ipv4(skb)) return NF_DROP; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; } @@ -913,8 +913,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) if (skb->len + mtu_reserved > skb->dev->mtu) { struct brnf_frag_data *data; - if (br_parse_ip_options(skb)) - /* Drop invalid packet */ + if (br_validate_ipv4(skb)) return NF_DROP; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; -- cgit From a4611d3b74b56658438ad1de4737a61a46be0fc0 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:29:38 +0200 Subject: netfilter: bridge: re-order check_hbh_len() Prepare check_hbh_len() to be called from newly introduced br_validate_ipv6() in next commit. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 111 +++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 962d5f867f62..d201ea4440c9 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -264,6 +264,62 @@ drop: return -1; } +/* We only check the length. A bridge shouldn't do any hop-by-hop stuff + * anyway + */ +static int check_hbh_len(struct sk_buff *skb) +{ + unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); + u32 pkt_len; + const unsigned char *nh = skb_network_header(skb); + int off = raw - nh; + int len = (raw[1] + 1) << 3; + + if ((raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + case IPV6_TLV_JUMBO: + if (nh[off + 1] != 4 || (off & 3) != 2) + goto bad; + pkt_len = ntohl(*(__be32 *)(nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + goto bad; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + goto bad; + if (pskb_trim_rcsum(skb, + pkt_len + sizeof(struct ipv6hdr))) + goto bad; + nh = skb_network_header(skb); + break; + default: + if (optlen > len) + goto bad; + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 0; +bad: + return -1; +} + static void nf_bridge_update_protocol(struct sk_buff *skb) { switch (skb->nf_bridge->orig_proto) { @@ -549,61 +605,6 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb) return skb->dev; } -/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */ -static int check_hbh_len(struct sk_buff *skb) -{ - unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); - u32 pkt_len; - const unsigned char *nh = skb_network_header(skb); - int off = raw - nh; - int len = (raw[1] + 1) << 3; - - if ((raw + len) - skb->data > skb_headlen(skb)) - goto bad; - - off += 2; - len -= 2; - - while (len > 0) { - int optlen = nh[off + 1] + 2; - - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; - break; - - case IPV6_TLV_PADN: - break; - - case IPV6_TLV_JUMBO: - if (nh[off + 1] != 4 || (off & 3) != 2) - goto bad; - pkt_len = ntohl(*(__be32 *) (nh + off + 2)); - if (pkt_len <= IPV6_MAXPLEN || - ipv6_hdr(skb)->payload_len) - goto bad; - if (pkt_len > skb->len - sizeof(struct ipv6hdr)) - goto bad; - if (pskb_trim_rcsum(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - nh = skb_network_header(skb); - break; - default: - if (optlen > len) - goto bad; - break; - } - off += optlen; - len -= optlen; - } - if (len == 0) - return 0; -bad: - return -1; - -} - /* Replicate the checks that IPv6 does on packet reception and pass the packet * to ip6tables, which doesn't support NAT, so things are fairly simple. */ static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, -- cgit From efb6de9b4ba0092b2c55f6a52d16294a8a698edd Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Sat, 30 May 2015 15:30:16 +0200 Subject: netfilter: bridge: forward IPv6 fragmented packets IPv6 fragmented packets are not forwarded on an ethernet bridge with netfilter ip6_tables loaded. e.g. steps to reproduce 1) create a simple bridge like this modprobe br_netfilter brctl addbr br0 brctl addif br0 eth0 brctl addif br0 eth2 ifconfig eth0 up ifconfig eth2 up ifconfig br0 up 2) place a host with an IPv6 address on each side of the bridge set IPv6 address on host A: ip -6 addr add fd01:2345:6789:1::1/64 dev eth0 set IPv6 address on host B: ip -6 addr add fd01:2345:6789:1::2/64 dev eth0 3) run a simple ping command on host A with packets > MTU ping6 -s 4000 fd01:2345:6789:1::2 4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge IPv6 fragmented packets traverse the bridge cleanly until somebody runs. "ip6tables -t nat -nvL". As soon as it is run (and netfilter modules are loaded) IPv6 fragmented packets do not traverse the bridge any more (you see no more responses in ping's output). After applying this patch IPv6 fragmented packets traverse the bridge cleanly in above scenario. Signed-off-by: Bernhard Thaler [pablo@netfilter.org: small changes to br_nf_dev_queue_xmit] Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 139 +++++++++++++++++++++++++++++++++------------- net/bridge/br_private.h | 6 +- net/ipv6/netfilter.c | 3 +- 3 files changed, 106 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index d201ea4440c9..535f9dae743e 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -320,6 +321,55 @@ bad: return -1; } +/* Equivalent to br_validate_ipv4 for IPv6 */ +static int br_validate_ipv6(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + struct net_device *dev = skb->dev; + struct inet6_dev *idev = in6_dev_get(skb->dev); + u32 pkt_len; + u8 ip6h_len = sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, ip6h_len)) + goto inhdr_error; + + if (skb->len < ip6h_len) + goto drop; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + goto inhdr_error; + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + ip6h_len > skb->len) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) + goto drop; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + /* No IP options in IPv6 header; however it should be + * checked if some next headers need special treatment + */ + return 0; + +inhdr_error: + IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + static void nf_bridge_update_protocol(struct sk_buff *skb) { switch (skb->nf_bridge->orig_proto) { @@ -405,6 +455,8 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) struct net_device *dev = skb->dev; const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; @@ -606,35 +658,15 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb) } /* Replicate the checks that IPv6 does on packet reception and pass the packet - * to ip6tables, which doesn't support NAT, so things are fairly simple. */ + * to ip6tables. + */ static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; - const struct ipv6hdr *hdr; - u32 pkt_len; - - if (skb->len < sizeof(struct ipv6hdr)) - return NF_DROP; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return NF_DROP; - - hdr = ipv6_hdr(skb); - - if (hdr->version != 6) - return NF_DROP; - pkt_len = ntohs(hdr->payload_len); - - if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return NF_DROP; - if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) - return NF_DROP; - } - if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) + if (br_validate_ipv6(skb)) return NF_DROP; nf_bridge_put(skb->nf_bridge); @@ -738,9 +770,11 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { - if (skb->protocol == htons(ETH_P_IP)) { + if (skb->protocol == htons(ETH_P_IP)) nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; - } + + if (skb->protocol == htons(ETH_P_IPV6)) + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; in = nf_bridge->physindev; if (nf_bridge->pkt_otherhost) { @@ -808,6 +842,12 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; } + if (pf == NFPROTO_IPV6) { + if (br_validate_ipv6(skb)) + return NF_DROP; + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + } + nf_bridge->physoutdev = skb->dev; if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); @@ -855,7 +895,7 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, return NF_STOLEN; } -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) { struct brnf_frag_data *data; @@ -875,6 +915,7 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) nf_bridge_info_free(skb); return br_dev_queue_push_xmit(sk, skb); } +#endif static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, int (*output)(struct sock *, struct sk_buff *)) @@ -897,21 +938,23 @@ static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) { - int ret; struct nf_bridge_info *nf_bridge; unsigned int mtu_reserved; - if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) { + mtu_reserved = nf_bridge_mtu_reduction(skb); + + if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(sk, skb); } - mtu_reserved = nf_bridge_mtu_reduction(skb); nf_bridge = nf_bridge_info_get(skb); + +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ - if (skb->len + mtu_reserved > skb->dev->mtu) { + if (skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; if (br_validate_ipv4(skb)) @@ -928,21 +971,37 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); - } else { - nf_bridge_info_free(skb); - ret = br_dev_queue_push_xmit(sk, skb); + return br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); } +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + struct brnf_frag_data *data; - return ret; -} -#else -static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) -{ + if (br_validate_ipv6(skb)) + return NF_DROP; + + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + + nf_bridge_update_protocol(skb); + + data = this_cpu_ptr(&brnf_frag_data_storage); + data->encap_size = nf_bridge_encap_header_len(skb); + data->size = ETH_HLEN + data->encap_size; + + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, + data->size); + + if (v6ops) + return v6ops->fragment(sk, skb, br_nf_push_frag_xmit); + else + return -EMSGSIZE; + } +#endif nf_bridge_info_free(skb); return br_dev_queue_push_xmit(sk, skb); } -#endif /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8cde96e68fd5..5dccced71269 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #define BR_HASH_BITS 8 @@ -214,7 +215,10 @@ struct net_bridge spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct rtable fake_rtable; + union { + struct rtable fake_rtable; + struct rt6_info fake_rt6_info; + }; bool nf_call_iptables; bool nf_call_ip6tables; bool nf_call_arptables; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index bbca09fe9553..b4de08a83e0b 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -191,7 +191,8 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, static const struct nf_ipv6_ops ipv6ops = { .chk_addr = ipv6_chk_addr, - .route_input = ip6_route_input + .route_input = ip6_route_input, + .fragment = ip6_fragment }; static const struct nf_afinfo nf_ip6_afinfo = { -- cgit From 33b1f31392861947fa2a2a57c3a39ab63b8c9f9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 5 Jun 2015 13:28:38 +0200 Subject: net: ip_fragment: remove BRIDGE_NETFILTER mtu special handling since commit d6b915e29f4adea9 ("ip_fragment: don't forward defragmented DF packet") the largest fragment size is available in the IPCB. Therefore we no longer need to care about 'encapsulation' overhead of stripped PPPOE/VLAN headers since ip_do_fragment doesn't use device mtu in such cases. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 7 +++++++ net/ipv4/ip_output.c | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 535f9dae743e..1e62ae5d8f4e 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -936,6 +936,13 @@ static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, return ip_do_fragment(sk, skb, output); } +static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) +{ + if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) + return PPPOE_SES_HLEN; + return 0; +} + static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f5f5ef1cebd5..19d7e43b5370 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -549,10 +549,6 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge) - mtu -= nf_bridge_mtu_reduction(skb); -#endif IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: -- cgit From d7b597421519d6f680eb8e152a0d8447466ee2d6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 5 Jun 2015 13:27:13 +0200 Subject: netfilter: bridge: restore vlan tag when refragmenting If bridge netfilter is used with both bridge-nf-call-iptables and bridge-nf-filter-vlan-tagged enabled then ip fragments in VLAN frames are sent without the vlan header. This has never worked reliably. Turns out this relied on pre-3.5 behaviour where skb frag_list was used to store ip fragments; ip_fragment() then re-used these skbs. But since commit 3cc4949269e01f39443d0fcfffb5bc6b47878d45 ("ipv4: use skb coalescing in defragmentation") this is no longer the case. ip_do_fragment now needs to allocate new skbs, but these don't contain the vlan tag information anymore. Fix it by storing vlan information of the ressembled skb in the br netfilter percpu frag area, and restore them for each of the fragments. Fixes: 3cc4949269e01f3 ("ipv4: use skb coalescing in defragmentation") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 1e62ae5d8f4e..e4e5f2f29173 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -116,6 +116,8 @@ struct brnf_frag_data { char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; u8 encap_size; u8 size; + u16 vlan_tci; + __be16 vlan_proto; }; static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); @@ -909,6 +911,11 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) return 0; } + if (data->vlan_tci) { + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + } + skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); @@ -972,6 +979,9 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) nf_bridge_update_protocol(skb); data = this_cpu_ptr(&brnf_frag_data_storage); + + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; -- cgit From 71ae0dff02d756e4d2ca710b79f2ff5390029a5f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Jun 2015 01:34:54 +0200 Subject: netfilter: xtables: use percpu rule counters The binary arp/ip/ip6tables ruleset is stored per cpu. The only reason left as to why we need percpu duplication are the rule counters embedded into ipt_entry et al -- since each cpu has its own copy of the rules, all counters can be lockless. The downside is that the more cpus are supported, the more memory is required. Rules are not just duplicated per online cpu but for each possible cpu, i.e. if maxcpu is 144, then rule is duplicated 144 times, not for the e.g. 64 cores present. To save some memory and also improve utilization of shared caches it would be preferable to only store the rule blob once. So we first need to separate counters and the rule blob. Instead of using entry->counters, allocate this percpu and store the percpu address in entry->counters.pcnt on CONFIG_SMP. This change makes no sense as-is; it is merely an intermediate step to remove the percpu duplication of the rule set in a followup patch. Suggested-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Reported-by: Marcelo Ricardo Leitner Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 32 +++++++++++++++++++++++++++----- net/ipv4/netfilter/ip_tables.c | 31 +++++++++++++++++++++++++++---- net/ipv6/netfilter/ip6_tables.c | 31 ++++++++++++++++++++++++++----- 3 files changed, 80 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a61200754f4b..0ada09ae6e81 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -289,13 +289,15 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { const struct xt_entry_target *t; + struct xt_counters *counter; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } - ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); @@ -521,6 +523,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); @@ -538,6 +544,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) err: module_put(t->u.kernel.target->me); out: + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -614,6 +622,7 @@ static inline void cleanup_entry(struct arpt_entry *e) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -723,13 +732,15 @@ static void get_counters(const struct xt_table_info *t, i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -1186,7 +1197,10 @@ static int do_add_counters(struct net *net, const void __user *user, loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1416,9 +1430,17 @@ static int translate_compat_table(const char *name, i = 0; xt_entry_foreach(iter1, entry1, newinfo->size) { + iter1->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(iter1->counters.pcnt)) { + ret = -ENOMEM; + break; + } + ret = check_target(iter1, name); - if (ret != 0) + if (ret != 0) { + xt_percpu_counter_free(iter1->counters.pcnt); break; + } ++i; if (strcmp(arpt_get_target(iter1)->u.user.name, XT_ERROR_TARGET) == 0) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e7abf5145edc..d190b10dabfe 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -345,6 +345,7 @@ ipt_do_table(struct sk_buff *skb, do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); if (!ip_packet_match(ip, indev, outdev, @@ -361,7 +362,8 @@ ipt_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -665,6 +667,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -691,6 +697,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, ret = check_target(e, net, name); if (ret) goto err; + return 0; err: module_put(t->u.kernel.target->me); @@ -700,6 +707,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -784,6 +794,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -888,13 +899,15 @@ get_counters(const struct xt_table_info *t, i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -1374,7 +1387,10 @@ do_add_counters(struct net *net, const void __user *user, loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1608,6 +1624,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) unsigned int j; int ret = 0; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -1632,6 +1652,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index cdd085f8b770..a1190ee14723 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -367,6 +367,7 @@ ip6t_do_table(struct sk_buff *skb, do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); acpar.thoff = 0; @@ -384,7 +385,8 @@ ip6t_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); IP_NF_ASSERT(t->u.kernel.target); @@ -679,6 +681,10 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -714,6 +720,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -797,6 +806,8 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -901,13 +912,15 @@ get_counters(const struct xt_table_info *t, i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -1374,7 +1387,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, goto free; } - local_bh_disable(); private = t->private; if (private->number != num_counters) { @@ -1388,7 +1400,10 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, addend = xt_write_recseq_begin(); loc_cpu_entry = private->entries[curcpu]; xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1621,6 +1636,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; j = 0; mtpar.net = net; mtpar.table = name; @@ -1645,6 +1663,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } -- cgit From 482cfc318559e2527dfd8513582d2fdb276e47c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Jun 2015 01:34:55 +0200 Subject: netfilter: xtables: avoid percpu ruleset duplication We store the rule blob per (possible) cpu. Unfortunately this means we can waste lot of memory on big smp machines. ipt_entry structure ('rule head') is 112 byte, so e.g. with maxcpu=64 one single rule eats close to 8k RAM. Since previous patch made counters percpu it appears there is nothing left in the rule blob that needs to be percpu. On my test system (144 possible cpus, 400k dummy rules) this change saves close to 9 Gigabyte of RAM. Reported-by: Marcelo Ricardo Leitner Acked-by: Jesper Dangaard Brouer Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 50 ++++++++++--------------------- net/ipv4/netfilter/ip_tables.c | 64 ++++++++++------------------------------ net/ipv6/netfilter/ip6_tables.c | 65 ++++++++++------------------------------- net/netfilter/x_tables.c | 23 +++++---------- 4 files changed, 55 insertions(+), 147 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 0ada09ae6e81..d75c139393fc 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -275,7 +275,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[smp_processor_id()]; + table_base = private->entries; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -711,12 +711,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -731,7 +725,7 @@ static void get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; @@ -785,7 +779,7 @@ static int copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; /* ... then copy entire thing ... */ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; @@ -880,10 +874,10 @@ static int compat_table_info(const struct xt_table_info *info, if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1048,7 +1042,7 @@ static int __do_replace(struct net *net, const char *name, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) cleanup_entry(iter); @@ -1095,8 +1089,7 @@ static int do_replace(struct net *net, const void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1126,7 +1119,7 @@ static int do_replace(struct net *net, const void __user *user, static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1136,7 +1129,6 @@ static int do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct arpt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1192,11 +1184,9 @@ static int do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; + addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { + xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); @@ -1410,7 +1400,7 @@ static int translate_compat_table(const char *name, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1470,11 +1460,6 @@ static int translate_compat_table(const char *name, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1533,8 +1518,7 @@ static int compat_do_replace(struct net *net, void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1631,7 +1615,6 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *pos; unsigned int size; int ret = 0; - void *loc_cpu_entry; unsigned int i = 0; struct arpt_entry *iter; @@ -1639,11 +1622,9 @@ static int compat_copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy on our node/cpu */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -1812,8 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); @@ -1844,7 +1824,7 @@ void arpt_unregister_table(struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter); if (private->number > private->initial_entries) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d190b10dabfe..6151500c2df8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -254,15 +254,13 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ipt_entry *e) { - const void *table_base; const struct ipt_entry *root; const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; struct net *net = dev_net(in ? in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; @@ -331,7 +329,7 @@ ipt_do_table(struct sk_buff *skb, * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; @@ -877,12 +875,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -898,7 +890,7 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; @@ -946,17 +938,13 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - const void *loc_cpu_entry; + void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1070,10 +1058,10 @@ static int compat_table_info(const struct xt_table_info *info, if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1194,7 +1182,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_old_entry; struct ipt_entry *iter; ret = 0; @@ -1237,8 +1224,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1284,8 +1270,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1316,7 +1301,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1326,7 +1311,6 @@ do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct ipt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1382,11 +1366,8 @@ do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { + xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); @@ -1739,7 +1720,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1791,11 +1772,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1842,8 +1818,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1914,7 +1889,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ipt_entry *iter; @@ -1922,14 +1896,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2104,8 +2073,7 @@ struct xt_table *ipt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2136,7 +2104,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index a1190ee14723..80a7f0d38aa4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -283,15 +283,13 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ip6t_entry *e) { - const void *table_base; const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; struct net *net = dev_net(in ? in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; @@ -357,7 +355,7 @@ ip6t_do_table(struct sk_buff *skb, */ smp_read_barrier_depends(); cpu = smp_processor_id(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; @@ -890,12 +888,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -911,7 +903,7 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; @@ -959,17 +951,13 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - const void *loc_cpu_entry; + void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1083,10 +1071,10 @@ static int compat_table_info(const struct xt_table_info *info, if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET6, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1207,7 +1195,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - const void *loc_cpu_old_entry; struct ip6t_entry *iter; ret = 0; @@ -1250,8 +1237,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1297,8 +1283,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1329,7 +1314,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1339,7 +1324,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - const void *loc_cpu_entry; struct ip6t_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1395,11 +1379,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); addend = xt_write_recseq_begin(); - loc_cpu_entry = private->entries[curcpu]; - xt_entry_foreach(iter, loc_cpu_entry, private->size) { + xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); @@ -1407,7 +1388,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, ++i; } xt_write_recseq_end(addend); - unlock_up_free: local_bh_enable(); xt_table_unlock(t); @@ -1750,7 +1730,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1802,11 +1782,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1853,8 +1828,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1925,7 +1899,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ip6t_entry *iter; @@ -1933,14 +1906,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2115,8 +2083,7 @@ struct xt_table *ip6t_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2146,7 +2113,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 83032464a4bd..6062ce3e862c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -659,7 +659,6 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user); struct xt_table_info *xt_alloc_table_info(unsigned int size) { struct xt_table_info *newinfo; - int cpu; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) @@ -671,19 +670,14 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) newinfo->size = size; - for_each_possible_cpu(cpu) { - if (size <= PAGE_SIZE) - newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, - cpu_to_node(cpu)); - else - newinfo->entries[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); + if (size <= PAGE_SIZE) + newinfo->entries = kmalloc(size, GFP_KERNEL); + else + newinfo->entries = vmalloc(size); - if (newinfo->entries[cpu] == NULL) { - xt_free_table_info(newinfo); - return NULL; - } + if (newinfo->entries == NULL) { + xt_free_table_info(newinfo); + return NULL; } return newinfo; @@ -694,8 +688,7 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - for_each_possible_cpu(cpu) - kvfree(info->entries[cpu]); + kvfree(info->entries); if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) -- cgit From b60f2f3d65de3f3e9e63855e5f5070a3fedcccba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Jun 2015 12:12:22 +0200 Subject: net: ipv4: un-inline ip_finish_output2 text data bss dec hex filename old: 16527 44 0 16571 40bb net/ipv4/ip_output.o new: 14935 44 0 14979 3a83 net/ipv4/ip_output.o Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f5f5ef1cebd5..55f3c2e94357 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -172,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; -- cgit From 611d23c559a328ca3f84ac120c02d5a9f88c08f5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 12 Jun 2015 09:01:05 -0700 Subject: flow_dissector: Fix MPLS entropy label handling in flow dissector Need to shift after masking to get label value for comparison. Fixes: b3baa0fbd02a1a9d493d8 ("mpls: Add MPLS entropy label in flow_keys") Reported-by: Dan Carpenter Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 77e22e4fc898..1818cdcaa9b9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -299,8 +299,8 @@ mpls: if (!hdr) return false; - if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) == - MPLS_LABEL_ENTROPY) { + if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> + MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { key_keyid = skb_flow_dissector_target(flow_dissector, -- cgit From 6a74fcf426f51aaa569f0b973d10ac20468df238 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 12 Jun 2015 09:01:06 -0700 Subject: flow_dissector: add support for dst, hop-by-hop and routing ext hdrs If dst, hop-by-hop or routing extension headers are present determine length of the options and skip over them in flow dissection. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1818cdcaa9b9..22e4dffa0c8b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -327,6 +327,7 @@ mpls: return false; } +ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { struct gre_hdr { @@ -383,6 +384,22 @@ mpls: } goto again; } + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 _opthdr[2], *opthdr; + + if (proto != htons(ETH_P_IPV6)) + break; + + opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), + data, hlen, &_opthdr); + + ip_proto = _opthdr[0]; + nhoff += (_opthdr[1] + 1) << 3; + + goto ip_proto_again; + } case IPPROTO_IPIP: proto = htons(ETH_P_IP); goto ip; -- cgit From 34ac49664149dd8923e0de5d871f86c80292d27b Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Fri, 12 Jun 2015 15:35:53 +0200 Subject: NFC: nci: remove current SLEEP mode management NCI deactivate management was modified to support all NCI deactivation type. Problem is that all the API are not ready yet for it. Problem is that with current code, when neard asks to deactivate the tag it sends a deactivate SLEEP but nobody will then send a IDLE deactivate. This IDLE deactivate is mandatory since NFC controller can only be unlocked by DH. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index f9aa08780b06..95af2d24d5be 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -798,7 +798,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) { nci_request(ndev, nci_rf_deactivate_req, - NCI_DEACTIVATE_TYPE_SLEEP_MODE, + NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } } -- cgit From 1e98a0f08abddde87f0f93237f10629ecb4880ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jun 2015 19:31:32 -0700 Subject: flow_dissector: fix ipv6 dst, hop-by-hop and routing ext hdrs __skb_header_pointer() returns a pointer that must be checked. Fixes infinite loop reported by Alexei, and add __must_check to catch these errors earlier. Fixes: 6a74fcf426f5 ("flow_dissector: add support for dst, hop-by-hop and routing ext hdrs") Reported-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 22e4dffa0c8b..476e5dda59e1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -394,9 +394,11 @@ ip_proto_again: opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); + if (!opthdr) + return false; - ip_proto = _opthdr[0]; - nhoff += (_opthdr[1] + 1) << 3; + ip_proto = opthdr[0]; + nhoff += (opthdr[1] + 1) << 3; goto ip_proto_again; } -- cgit From a2f0fad32b0d0022c7e5706d333d74a9579f3742 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jun 2015 19:34:03 -0700 Subject: tcp: tcp_v6_connect() cleanup Remove dead code from tcp_v6_connect() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 45a7176ed460..6748c4277aff 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -120,7 +120,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; - struct rt6_info *rt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -258,7 +257,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(sk, dst, NULL, NULL); - rt = (struct rt6_info *) dst; if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) -- cgit From edda0791743eafc36382fb893d91547f36edaf7d Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Fri, 12 Jun 2015 21:11:54 +0200 Subject: netfilter: ipset: Use SET_WITH_*() helpers to test set extensions Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 475e4960a164..347d97afc887 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -390,12 +390,12 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], { u64 fullmark; if (tb[IPSET_ATTR_TIMEOUT]) { - if (!(set->extensions & IPSET_EXT_TIMEOUT)) + if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { - if (!(set->extensions & IPSET_EXT_COUNTER)) + if (!SET_WITH_COUNTER(set)) return -IPSET_ERR_COUNTER; if (tb[IPSET_ATTR_BYTES]) ext->bytes = be64_to_cpu(nla_get_be64( @@ -405,25 +405,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], tb[IPSET_ATTR_PACKETS])); } if (tb[IPSET_ATTR_COMMENT]) { - if (!(set->extensions & IPSET_EXT_COMMENT)) + if (!SET_WITH_COMMENT(set)) return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } if (tb[IPSET_ATTR_SKBMARK]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); ext->skbmark = fullmark >> 32; ext->skbmarkmask = fullmark & 0xffffffff; } if (tb[IPSET_ATTR_SKBPRIO]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbprio = be32_to_cpu(nla_get_be32( tb[IPSET_ATTR_SKBPRIO])); } if (tb[IPSET_ATTR_SKBQUEUE]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbqueue = be16_to_cpu(nla_get_be16( tb[IPSET_ATTR_SKBQUEUE])); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7952869c8023..8dd82db05ed0 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -431,7 +431,7 @@ mtype_destroy(struct ip_set *set) { struct htype *h = set->data; - if (set->extensions & IPSET_EXT_TIMEOUT) + if (SET_WITH_TIMEOUT(set)) del_timer_sync(&h->gc); mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); -- cgit From 7dd37bc8e605d3ce14e6a1bc88ebbfae7ef43b9f Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Fri, 12 Jun 2015 21:14:09 +0200 Subject: netfilter: ipset: Check extensions attributes before getting extensions. Make all extensions attributes checks within ip_set_get_extensions() and reduce number of duplicated code. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_bitmap_ip.c | 8 +------- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 8 +------- net/netfilter/ipset/ip_set_bitmap_port.c | 8 +------- net/netfilter/ipset/ip_set_core.c | 9 +++++++++ net/netfilter/ipset/ip_set_hash_ip.c | 14 +------------- net/netfilter/ipset/ip_set_hash_ipmark.c | 14 +------------- net/netfilter/ipset/ip_set_hash_ipport.c | 14 +------------- net/netfilter/ipset/ip_set_hash_ipportip.c | 14 +------------- net/netfilter/ipset/ip_set_hash_ipportnet.c | 14 +------------- net/netfilter/ipset/ip_set_hash_mac.c | 8 +------- net/netfilter/ipset/ip_set_hash_net.c | 16 ++-------------- net/netfilter/ipset/ip_set_hash_netiface.c | 16 ++-------------- net/netfilter/ipset/ip_set_hash_netnet.c | 16 ++-------------- net/netfilter/ipset/ip_set_hash_netport.c | 16 ++-------------- net/netfilter/ipset/ip_set_hash_netportnet.c | 16 ++-------------- net/netfilter/ipset/ip_set_list_set.c | 8 +------- 16 files changed, 29 insertions(+), 170 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 2fe6de46f6d0..212005e7d735 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -138,13 +138,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index eb188561d65f..0648e749114b 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -239,13 +239,7 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 898edb693b3f..10ed264dbe91 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -137,13 +137,7 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], int ret = 0; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 347d97afc887..68ae551f2b39 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -389,6 +389,15 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { u64 fullmark; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + return -IPSET_ERR_PROTOCOL; + if (tb[IPSET_ATTR_TIMEOUT]) { if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 54df48b5c455..0d955a480dee 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -108,13 +108,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, hosts; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -247,12 +241,6 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index d231248eb3e2..f4e14ec4684a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -109,13 +109,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -242,12 +236,6 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index a47c29f12090..02d9dbaeea6b 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -118,13 +118,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -282,12 +276,6 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 89615f134845..4e3b1c3f32b2 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -121,13 +121,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -293,12 +287,6 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 6ba7a7e083f9..988567e7dbce 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -176,13 +176,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -429,13 +423,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 1f8668d7a538..0d4951997943 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -107,13 +107,7 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_ETHER] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + if (unlikely(!tb[IPSET_ATTR_ETHER])) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 2e63dad8644d..5eb334dea163 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -147,13 +147,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -319,13 +313,7 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index fe481f677f56..5eef6f315ccd 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -295,13 +295,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -531,13 +525,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 847047483560..775b1b05a318 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -169,13 +169,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -400,13 +394,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 8273819c1a2f..0eb73daf375e 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -169,13 +169,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -387,13 +381,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 1451a8ac938f..c3634add9084 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -187,13 +187,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -463,13 +457,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 5bd3b1eae3fa..7d1377d6b8bb 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -384,13 +384,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], int ret = 0; if (unlikely(!tb[IPSET_ATTR_NAME] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) -- cgit From 2c227f278a92ca3a1515373cdf9cce3766433e40 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Fri, 12 Jun 2015 21:23:31 +0200 Subject: netfilter: ipset: Permit CIDR equal to the host address CIDR in IPv6 Permit userspace to supply CIDR length equal to the host address CIDR length in netlink message. Prohibit any other CIDR length for IPv6 variant of the set. Also return -IPSET_ERR_HASH_RANGE_UNSUPPORTED instead of generic -IPSET_ERR_PROTOCOL in IPv6 variant of hash:ip,port,net when IPSET_ATTR_IP_TO attribute is given. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_ip.c | 12 +++++++++--- net/netfilter/ipset/ip_set_hash_ipmark.c | 12 +++++++++--- net/netfilter/ipset/ip_set_hash_ipport.c | 12 +++++++++--- net/netfilter/ipset/ip_set_hash_ipportip.c | 12 +++++++++--- net/netfilter/ipset/ip_set_hash_ipportnet.c | 10 +++++++--- 5 files changed, 43 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 0d955a480dee..2b08b1bf6e3f 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -240,10 +240,16 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_IP] || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index f4e14ec4684a..68fe40ca4a1f 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -235,10 +235,16 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 02d9dbaeea6b..9f036393a6e6 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -275,10 +275,16 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 4e3b1c3f32b2..aa0966b145a9 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -286,10 +286,16 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 988567e7dbce..f3c62565f38c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -423,12 +423,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); -- cgit From a212e08e8e0a5c689e61dd175b6e99223dda835c Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Fri, 12 Jun 2015 21:26:43 +0200 Subject: netfilter: ipset: Make sure we always return line number on batch Even if we return with generic IPSET_ERR_PROTOCOL it is good idea to return line number if we called in batch mode. Moreover we are not always exiting with IPSET_ERR_PROTOCOL. For example hash:ip,port,net may return IPSET_ERR_HASH_RANGE_UNSUPPORTED or IPSET_ERR_INVALID_CIDR. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_bitmap_ip.c | 6 +++--- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 6 +++--- net/netfilter/ipset/ip_set_bitmap_port.c | 6 +++--- net/netfilter/ipset/ip_set_hash_ip.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_ipmark.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_ipport.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_ipportip.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_ipportnet.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_mac.c | 6 +++--- net/netfilter/ipset/ip_set_hash_net.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_netiface.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_netnet.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_netport.c | 12 ++++++------ net/netfilter/ipset/ip_set_hash_netportnet.c | 12 ++++++------ net/netfilter/ipset/ip_set_list_set.c | 6 +++--- 15 files changed, 75 insertions(+), 75 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 212005e7d735..7af99c3e5a4d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -138,12 +138,12 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP])) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 0648e749114b..773342292623 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -239,12 +239,12 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP])) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 10ed264dbe91..ec3bda1ec90e 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -136,13 +136,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], u16 port_to; int ret = 0; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 2b08b1bf6e3f..2bbadcc96ac5 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -108,12 +108,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, hosts; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP])) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; @@ -240,6 +240,9 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) @@ -251,9 +254,6 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 68fe40ca4a1f..3aafb36484b4 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -108,13 +108,13 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip, ip_to = 0; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; @@ -234,6 +234,9 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; @@ -246,9 +249,6 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 9f036393a6e6..4db1270f1197 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -116,14 +116,14 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; @@ -273,6 +273,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) @@ -286,9 +289,6 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index aa0966b145a9..c01bf68708ec 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -119,14 +119,14 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; @@ -284,6 +284,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) @@ -297,9 +300,6 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index f3c62565f38c..e38a029f3002 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -173,15 +173,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; @@ -420,6 +420,9 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -434,9 +437,6 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 0d4951997943..8981c8b242b3 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -107,12 +107,12 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_ETHER])) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_ETHER])) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 5eb334dea163..2988ec5c4c4a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -146,13 +146,13 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, last; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; @@ -312,15 +312,15 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 5eef6f315ccd..c80588c3071c 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -293,14 +293,14 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; @@ -523,6 +523,9 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) @@ -530,9 +533,6 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 775b1b05a318..2e6a1ae705a6 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -167,14 +167,14 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr, cidr2; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; @@ -392,6 +392,9 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) @@ -399,9 +402,6 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 0eb73daf375e..7a6448cbd8fb 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -166,15 +166,15 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; @@ -378,6 +378,9 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -386,9 +389,6 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index c3634add9084..7ad9a77ef957 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -183,6 +183,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr, cidr2; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || @@ -190,9 +193,6 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; @@ -453,6 +453,9 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || @@ -462,9 +465,6 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); if (ret) return ret; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 7d1377d6b8bb..107ea6cc47f1 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -383,13 +383,13 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set *s; int ret = 0; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_NAME] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; -- cgit From aff227581ed1ac299e3a50eef4bb1cef944e1404 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Fri, 12 Jun 2015 21:30:57 +0200 Subject: netfilter: ipset: Check CIDR value only when attribute is given There is no reason to check CIDR value regardless attribute specifying CIDR is given. Initialize cidr array in element structure on element structure declaration to let more freedom to the compiler to optimize initialization right before element structure is used. Remove local variables cidr and cidr2 for netnet and netportnet hashes as we do not use packed cidr value for such set types and can store value directly in e.cidr[]. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_net.c | 8 +++--- net/netfilter/ipset/ip_set_hash_netiface.c | 8 +++--- net/netfilter/ipset/ip_set_hash_netnet.c | 37 ++++++++++++---------------- net/netfilter/ipset/ip_set_hash_netportnet.c | 37 ++++++++++++---------------- 4 files changed, 41 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 2988ec5c4c4a..d19926ac3a03 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -329,11 +329,11 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - - if (!e.cidr || e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr || e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip, e.cidr); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index c80588c3071c..7b69fa28d774 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -541,10 +541,12 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + ip6_netmask(&e.ip, e.cidr); strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 2e6a1ae705a6..11eee0077b8b 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -160,17 +160,15 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet4_elem e = { }; + struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; - u8 cidr, cidr2; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ -188,17 +186,15 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr2 || cidr2 > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr2; } if (tb[IPSET_ATTR_CADT_FLAGS]) { @@ -245,15 +241,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) : ip2_from; while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); - last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); - e.cidr[1] = cidr2; + last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -388,14 +382,13 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet6_elem e = { }; + struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ -414,15 +407,17 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 7ad9a77ef957..6eb5f8798304 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -175,18 +175,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet4_elem e = { }; + struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; bool with_ports = false; - u8 cidr, cidr2; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -206,17 +204,15 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr || cidr > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); @@ -285,8 +281,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { @@ -297,8 +292,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, - &cidr2); - e.cidr[1] = cidr2; + &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -447,7 +441,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet6_elem e = { }; + struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; @@ -456,7 +450,6 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - e.cidr[0] = e.cidr[1] = HOST_MASK; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -477,15 +470,17 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK)) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); -- cgit From f690cbaed9fe4d77592e24139db7ad790641c4fd Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 12 Jun 2015 22:11:00 +0200 Subject: netfilter: ipset: Fix cidr handling for hash:*net* types Commit "Simplify cidr handling for hash:*net* types" broke the cidr handling for the hash:*net* types when the sets were used by the SET target: entries with invalid cidr values were added to the sets. Reported by Jonathan Johnson. Testsuite entry is added to verify the fix. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 44 ++++++++++++++++------------ net/netfilter/ipset/ip_set_hash_ipportnet.c | 4 +-- net/netfilter/ipset/ip_set_hash_net.c | 4 +-- net/netfilter/ipset/ip_set_hash_netiface.c | 4 +-- net/netfilter/ipset/ip_set_hash_netnet.c | 8 ++--- net/netfilter/ipset/ip_set_hash_netport.c | 4 +-- net/netfilter/ipset/ip_set_hash_netportnet.c | 8 ++--- 7 files changed, 42 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 8dd82db05ed0..2f1985e71f6c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -149,17 +149,21 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #endif /* cidr + 1 is stored in net_prefixes to support /0 */ -#define SCIDR(cidr, i) (__CIDR(cidr, i) + 1) +#define NCIDR_PUT(cidr) ((cidr) + 1) +#define NCIDR_GET(cidr) ((cidr) - 1) #ifdef IP_SET_HASH_WITH_NETS_PACKED /* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */ -#define GCIDR(cidr, i) (__CIDR(cidr, i) + 1) -#define NCIDR(cidr) (cidr) +#define DCIDR_PUT(cidr) ((cidr) - 1) +#define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1) #else -#define GCIDR(cidr, i) (__CIDR(cidr, i)) -#define NCIDR(cidr) (cidr - 1) +#define DCIDR_PUT(cidr) (cidr) +#define DCIDR_GET(cidr, i) __CIDR(cidr, i) #endif +#define INIT_CIDR(cidr, host_mask) \ + DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask)) + #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 @@ -303,7 +307,8 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different - * sized networks */ + * sized networks. cidr == real cidr + 1 to support /0. + */ static void mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) { @@ -498,8 +503,10 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) pr_debug("expired %u/%u\n", i, j); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, SCIDR(data->cidr, k), - nets_length, k); + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, + k)), + nets_length, k); #endif ip_set_ext_destroy(set, data); if (j != n->pos - 1) @@ -692,9 +699,9 @@ reuse_slot: data = ahash_data(n, j, set->dsize); #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) { - mtype_del_cidr(h, SCIDR(data->cidr, i), + mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(data->cidr, i)), NLEN(set->family), i); - mtype_add_cidr(h, SCIDR(d->cidr, i), + mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), NLEN(set->family), i); } #endif @@ -711,8 +718,8 @@ reuse_slot: data = ahash_data(n, n->pos++, set->dsize); #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family), - i); + mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), + NLEN(set->family), i); #endif h->elements++; } @@ -772,8 +779,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, h->elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family), - j); + mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), + NLEN(set->family), j); #endif ip_set_ext_destroy(set, data); if (n->pos + AHASH_INIT_SIZE < n->size) { @@ -836,12 +843,13 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) { #if IPSET_NET_COUNT == 2 mtype_data_reset_elem(d, &orig); - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false); for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi; k++) { - mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true); + mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]), + true); #else - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0])); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); n = hbucket(t, key); @@ -889,7 +897,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, /* If we test an IP address and not a network address, * try all possible network sizes */ for (i = 0; i < IPSET_NET_COUNT; i++) - if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family)) + if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family)) break; if (i == IPSET_NET_COUNT) { ret = mtype_test_cidrs(set, d, ext, mext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index e38a029f3002..50248debdc8b 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -141,7 +141,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -389,7 +389,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index d19926ac3a03..089b23fd1a94 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -120,7 +120,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -288,7 +288,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 7b69fa28d774..aac20768f6f0 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -235,7 +235,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -469,7 +469,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 11eee0077b8b..ed9cc45084dd 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -141,8 +141,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; @@ -364,8 +364,8 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 7a6448cbd8fb..fbaf8138e5d4 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -136,7 +136,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -348,7 +348,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 6eb5f8798304..a828cbc8bed7 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -152,8 +152,8 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; @@ -418,8 +418,8 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; -- cgit From c4c997839cf92cb1037e43a85cdb4cbf44ed39a5 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 11:59:45 +0200 Subject: netfilter: ipset: Fix parallel resizing and listing of the same set When elements added to a hash:* type of set and resizing triggered, parallel listing could start to list the original set (before resizing) and "continue" with listing the new set. Fix it by references and using the original hash table for listing. Therefore the destroying of the original hash table may happen from the resizing or listing functions. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 29 +++++++++++++---------- net/netfilter/ipset/ip_set_hash_gen.h | 44 ++++++++++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 68ae551f2b39..777cac6fd64d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1211,12 +1211,16 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, static int ip_set_dump_done(struct netlink_callback *cb) { - struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; if (cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", - ip_set(inst, cb->args[IPSET_CB_INDEX])->name); - __ip_set_put_byindex(inst, - (ip_set_id_t) cb->args[IPSET_CB_INDEX]); + struct ip_set_net *inst = + (struct ip_set_net *)cb->args[IPSET_CB_NET]; + ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; + struct ip_set *set = ip_set(inst, index); + + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); + __ip_set_put_byindex(inst, index); } return 0; } @@ -1247,12 +1251,6 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); - /* cb->args[IPSET_CB_NET]: net namespace - * [IPSET_CB_DUMP]: dump single set/all sets - * [IPSET_CB_INDEX]: set index - * [IPSET_CB_ARG0]: type specific - */ - if (cda[IPSET_ATTR_SETNAME]) { struct ip_set *set; @@ -1359,6 +1357,8 @@ dump_last: goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; + if (set->variant->uref) + set->variant->uref(set, cb, true); /* Fall through and add elements */ default: read_lock_bh(&set->lock); @@ -1375,6 +1375,8 @@ dump_last: dump_type = DUMP_LAST; cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); cb->args[IPSET_CB_INDEX] = 0; + if (set && set->variant->uref) + set->variant->uref(set, cb, false); goto dump_last; } goto out; @@ -1389,7 +1391,10 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", ip_set(inst, index)->name); + set = ip_set(inst, index); + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); __ip_set_put_byindex(inst, index); cb->args[IPSET_CB_ARG0] = 0; } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 2f1985e71f6c..5fcf70b0ebc2 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -71,6 +71,8 @@ struct hbucket { /* The hash table: the table size stored here in order to make resizing easy */ struct htable { + atomic_t ref; /* References for resizing */ + atomic_t uref; /* References for dumping */ u8 htable_bits; /* size of hash table == 2^htable_bits */ struct hbucket bucket[0]; /* hashtable buckets */ }; @@ -207,6 +209,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_del #undef mtype_test_cidrs #undef mtype_test +#undef mtype_uref #undef mtype_expire #undef mtype_resize #undef mtype_head @@ -248,6 +251,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_uref IPSET_TOKEN(MTYPE, _uref) #define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) #define mtype_head IPSET_TOKEN(MTYPE, _head) @@ -595,6 +599,9 @@ retry: t->htable_bits = htable_bits; read_lock_bh(&set->lock); + /* There can't be another parallel resizing, but dumping is possible */ + atomic_set(&orig->ref, 1); + atomic_inc(&orig->uref); for (i = 0; i < jhash_size(orig->htable_bits); i++) { n = hbucket(orig, i); for (j = 0; j < n->pos; j++) { @@ -609,6 +616,8 @@ retry: #ifdef IP_SET_HASH_WITH_NETS mtype_data_reset_flags(data, &flags); #endif + atomic_set(&orig->ref, 0); + atomic_dec(&orig->uref); read_unlock_bh(&set->lock); mtype_ahash_destroy(set, t, false); if (ret == -EAGAIN) @@ -631,7 +640,11 @@ retry: pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - mtype_ahash_destroy(set, orig, false); + /* If there's nobody else dumping the table, destroy it */ + if (atomic_dec_and_test(&orig->uref)) { + pr_debug("Table destroy by resize %p\n", orig); + mtype_ahash_destroy(set, orig, false); + } return 0; } @@ -961,13 +974,36 @@ nla_put_failure: return -EMSGSIZE; } +/* Make possible to run dumping parallel with resizing */ +static void +mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) +{ + struct htype *h = set->data; + struct htable *t; + + if (start) { + rcu_read_lock_bh(); + t = rcu_dereference_bh_nfnl(h->table); + atomic_inc(&t->uref); + cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; + rcu_read_unlock_bh(); + } else if (cb->args[IPSET_CB_PRIVATE]) { + t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + /* Resizing didn't destroy the hash table */ + pr_debug("Table destroy by dump: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + cb->args[IPSET_CB_PRIVATE] = 0; + } +} + /* Reply a LIST/SAVE request: dump the elements of the specified set */ static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { - const struct htype *h = set->data; - const struct htable *t = rcu_dereference_bh_nfnl(h->table); + const struct htable *t; struct nlattr *atd, *nested; const struct hbucket *n; const struct mtype_elem *e; @@ -980,6 +1016,7 @@ mtype_list(const struct ip_set *set, if (!atd) return -EMSGSIZE; pr_debug("list hash set %s\n", set->name); + t = (const struct htable *)cb->args[IPSET_CB_PRIVATE]; for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); cb->args[IPSET_CB_ARG0]++) { incomplete = skb_tail_pointer(skb); @@ -1047,6 +1084,7 @@ static const struct ip_set_type_variant mtype_variant = { .flush = mtype_flush, .head = mtype_head, .list = mtype_list, + .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, }; -- cgit From 9c1ba5c809381fb9fb779e2cc22a1c878a269ffb Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 13:39:38 +0200 Subject: netfilter: ipset: Make sure listing doesn't grab a set which is just being destroyed. There was a small window when all sets are destroyed and a concurrent listing of all sets could grab a set which is just being destroyed. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 49 +++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 777cac6fd64d..87b4182660ba 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -32,7 +32,8 @@ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ struct ip_set_net { struct ip_set * __rcu *ip_set_list; /* all individual sets */ ip_set_id_t ip_set_max; /* max number of sets */ - int is_deleted; /* deleted by ip_set_net_exit */ + bool is_deleted; /* deleted by ip_set_net_exit */ + bool is_destroyed; /* all sets are destroyed */ }; static int ip_set_net_id __read_mostly; @@ -980,12 +981,9 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static void -ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) +ip_set_destroy_set(struct ip_set *set) { - struct ip_set *set = ip_set(inst, index); - pr_debug("set: %s\n", set->name); - ip_set(inst, index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -1025,12 +1023,17 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, goto out; } } + inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) - ip_set_destroy_set(inst, i); + if (s) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(s); + } } + /* Modified by ip_set_destroy() only, which is serialized */ + inst->is_destroyed = false; } else { s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); @@ -1041,9 +1044,10 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, ret = -IPSET_ERR_BUSY; goto out; } + ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - ip_set_destroy_set(inst, i); + ip_set_destroy_set(s); } return 0; out: @@ -1283,6 +1287,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; + bool is_destroyed; int ret = 0; if (!cb->args[IPSET_CB_DUMP]) { @@ -1309,12 +1314,20 @@ dump_last: dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; + write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); - if (set == NULL) { + is_destroyed = inst->is_destroyed; + if (!set || is_destroyed) { + write_unlock_bh(&ip_set_ref_lock); if (dump_type == DUMP_ONE) { ret = -ENOENT; goto out; } + if (is_destroyed) { + /* All sets are just being destroyed */ + ret = 0; + goto out; + } continue; } /* When dumping all sets, we must dump "sorted" @@ -1322,14 +1335,17 @@ dump_last: */ if (dump_type != DUMP_ONE && ((dump_type == DUMP_ALL) == - !!(set->type->features & IPSET_DUMP_LAST))) + !!(set->type->features & IPSET_DUMP_LAST))) { + write_unlock_bh(&ip_set_ref_lock); continue; + } pr_debug("List set: %s\n", set->name); if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - __ip_set_get(set); + set->ref++; } + write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); @@ -2012,7 +2028,8 @@ ip_set_net_init(struct net *net) list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); if (!list) return -ENOMEM; - inst->is_deleted = 0; + inst->is_deleted = false; + inst->is_destroyed = false; rcu_assign_pointer(inst->ip_set_list, list); return 0; } @@ -2025,12 +2042,14 @@ ip_set_net_exit(struct net *net) struct ip_set *set = NULL; ip_set_id_t i; - inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + inst->is_deleted = true; /* flag for ip_set_nfnl_put */ for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL) - ip_set_destroy_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(set); + } } kfree(rcu_dereference_protected(inst->ip_set_list, 1)); } -- cgit From bd55389cc34b75948c2876c821175a976bbac5b1 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 14:02:51 +0200 Subject: netfilter:ipset Remove rbtree from hash:net,iface Remove rbtree in order to introduce RCU instead of rwlock in ipset Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_netiface.c | 163 ++++------------------------- 1 file changed, 20 insertions(+), 143 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index aac20768f6f0..3258189a296f 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -37,88 +36,13 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); -/* Interface name rbtree */ - -struct iface_node { - struct rb_node node; - char iface[IFNAMSIZ]; -}; - -#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) - -static void -rbtree_destroy(struct rb_root *root) -{ - struct iface_node *node, *next; - - rbtree_postorder_for_each_entry_safe(node, next, root, node) - kfree(node); - - *root = RB_ROOT; -} - -static int -iface_test(struct rb_root *root, const char **iface) -{ - struct rb_node *n = root->rb_node; - - while (n) { - const char *d = iface_data(n); - int res = strcmp(*iface, d); - - if (res < 0) - n = n->rb_left; - else if (res > 0) - n = n->rb_right; - else { - *iface = d; - return 1; - } - } - return 0; -} - -static int -iface_add(struct rb_root *root, const char **iface) -{ - struct rb_node **n = &(root->rb_node), *p = NULL; - struct iface_node *d; - - while (*n) { - char *ifname = iface_data(*n); - int res = strcmp(*iface, ifname); - - p = *n; - if (res < 0) - n = &((*n)->rb_left); - else if (res > 0) - n = &((*n)->rb_right); - else { - *iface = ifname; - return 0; - } - } - - d = kzalloc(sizeof(*d), GFP_ATOMIC); - if (!d) - return -ENOMEM; - strcpy(d->iface, *iface); - - rb_link_node(&d->node, p, n); - rb_insert_color(&d->node, root); - - *iface = d->iface; - return 0; -} - /* Type specific function prefix */ #define HTYPE hash_netiface #define IP_SET_HASH_WITH_NETS -#define IP_SET_HASH_WITH_RBTREE #define IP_SET_HASH_WITH_MULTI #define IP_SET_HASH_WITH_NET0 -#define STREQ(a, b) (strcmp(a, b) == 0) +#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ) /* IPv4 variant */ @@ -137,7 +61,7 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; - const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -151,7 +75,7 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -239,7 +163,6 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -249,35 +172,24 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr); -#define IFACE(dir) (par->dir ? par->dir->name : NULL) +#define IFACE(dir) (par->dir ? par->dir->name : "") #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - e.iface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_phyoutdev_name(skb); - if (!e.iface) + if (!eiface) return -EINVAL; + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; - return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -290,7 +202,6 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; - char iface[IFNAMSIZ]; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -314,18 +225,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -384,7 +284,7 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; - const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -398,7 +298,7 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -473,7 +373,6 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -485,29 +384,19 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - e.iface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); - if (!e.iface) + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_phyoutdev_name(skb); + if (!eiface) return -EINVAL; + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -516,11 +405,9 @@ static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - char iface[IFNAMSIZ]; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -549,17 +436,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip, e.cidr); - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); -- cgit From b57b2d1fa53fe8563bdfc66a33b844463b9af285 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 14:22:25 +0200 Subject: netfilter: ipset: Prepare the ipset core to use RCU at set level Replace rwlock_t with spinlock_t in "struct ip_set" and change the locking accordingly. Convert the comment extension into an rcu-avare object. Also, simplify the timeout routines. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 87b4182660ba..2b21a1983a98 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -209,15 +209,15 @@ ip_set_type_register(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", type->name, family_name(type->family), type->revision_min); - ret = -EINVAL; - goto unlock; + ip_set_type_unlock(); + return -EINVAL; } list_add_rcu(&type->list, &ip_set_type_list); pr_debug("type %s, family %s, revision %u:%u registered.\n", type->name, family_name(type->family), type->revision_min, type->revision_max); -unlock: ip_set_type_unlock(); + return ret; } EXPORT_SYMBOL_GPL(ip_set_type_register); @@ -231,12 +231,12 @@ ip_set_type_unregister(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u not registered\n", type->name, family_name(type->family), type->revision_min); - goto unlock; + ip_set_type_unlock(); + return; } list_del_rcu(&type->list); pr_debug("type %s, family %s with revision min %u unregistered.\n", type->name, family_name(type->family), type->revision_min); -unlock: ip_set_type_unlock(); synchronize_rcu(); @@ -531,16 +531,16 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -570,9 +570,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } @@ -593,9 +593,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } @@ -880,7 +880,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); if (!set) return -ENOMEM; - rwlock_init(&set->lock); + spin_lock_init(&set->lock); strlcpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; @@ -1062,9 +1062,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->flush(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); } static int @@ -1377,9 +1377,9 @@ dump_last: set->variant->uref(set, cb, true); /* Fall through and add elements */ default: - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->list(set, skb, cb); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; @@ -1462,9 +1462,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); retried = true; } while (ret == -EAGAIN && set->variant->resize && @@ -1644,9 +1644,9 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, set->type->adt_policy)) return -IPSET_ERR_PROTOCOL; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) ret = 1; -- cgit From 96f51428c43de20723630f0d756a7a9a42cbd974 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 14:39:59 +0200 Subject: netfilter: ipset: Introduce RCU locking in bitmap:* types There's nothing much required because the bitmap types use atomic bit operations. However the logic of adding elements slightly changed: first the MAC address updated (which is not atomic), then the element activated (added). The extensions may call kfree_rcu() therefore we call rcu_barrier() at module removal. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_bitmap_gen.h | 33 +++++++++++++++++++++---------- net/netfilter/ipset/ip_set_bitmap_ip.c | 3 ++- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 13 ++++++++++-- net/netfilter/ipset/ip_set_bitmap_port.c | 3 ++- 4 files changed, 38 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 6f024a8a1534..86429f369128 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -144,10 +144,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (ret == IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(x, set))) + ip_set_timeout_expired(ext_timeout(x, set))) { ret = 0; - else if (!(flags & IPSET_FLAG_EXIST)) + } else if (!(flags & IPSET_FLAG_EXIST)) { + set_bit(e->id, map->members); return -IPSET_ERR_EXIST; + } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } @@ -165,6 +167,10 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_comment(ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); + + /* Activate element */ + set_bit(e->id, map->members); + return 0; } @@ -203,10 +209,13 @@ mtype_list(const struct ip_set *set, struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; + int ret = 0; adt = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; + /* Extensions may be replaced */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { id = cb->args[IPSET_CB_ARG0]; @@ -222,9 +231,11 @@ mtype_list(const struct ip_set *set, if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + + goto nla_put_failure; } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; @@ -238,16 +249,18 @@ mtype_list(const struct ip_set *set, /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } ipset_nest_end(skb, adt); - return 0; +out: + rcu_read_unlock(); + return ret; } static void @@ -260,7 +273,7 @@ mtype_gc(unsigned long ul_set) /* We run parallel with other readers (test element) * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); + spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); @@ -269,7 +282,7 @@ mtype_gc(unsigned long ul_set) ip_set_ext_destroy(set, x); } } - read_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 7af99c3e5a4d..b8ce474c038d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -81,7 +81,7 @@ static inline int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -376,6 +376,7 @@ bitmap_ip_init(void) static void __exit bitmap_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ip_type); } diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 773342292623..fe00e87decc8 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -147,15 +147,23 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac_elem *elem; elem = get_elem(map->extensions, e->id, dsize); - if (test_and_set_bit(e->id, map->members)) { + if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { - if (e->ether && (flags & IPSET_FLAG_EXIST)) + if (e->ether && + (flags & IPSET_FLAG_EXIST) && + !ether_addr_equal(e->ether, elem->ether)) { + /* memcpy isn't atomic */ + clear_bit(e->id, map->members); + smp_mb__after_atomic(); memcpy(elem->ether, e->ether, ETH_ALEN); + } return IPSET_ADD_FAILED; } else if (!e->ether) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ + clear_bit(e->id, map->members); + smp_mb__after_atomic(); memcpy(elem->ether, e->ether, ETH_ALEN); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; @@ -413,6 +421,7 @@ bitmap_ipmac_init(void) static void __exit bitmap_ipmac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ipmac_type); } diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index ec3bda1ec90e..2d360f951d18 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -73,7 +73,7 @@ static inline int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -306,6 +306,7 @@ bitmap_port_init(void) static void __exit bitmap_port_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_port_type); } -- cgit From 18f84d41d34fa35d0d64bbaea01fe664553ecc06 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 17:29:56 +0200 Subject: netfilter: ipset: Introduce RCU locking in hash:* types Three types of data need to be protected in the case of the hash types: a. The hash buckets: standard rcu pointer operations are used. b. The element blobs in the hash buckets are stored in an array and a bitmap is used for book-keeping to tell which elements in the array are used or free. c. Networks per cidr values and the cidr values themselves are stored in fix sized arrays and need no protection. The values are modified in such an order that in the worst case an element testing is repeated once with the same cidr value. The ipset hash approach uses arrays instead of lists and therefore is incompatible with rhashtable. Performance is tested by Jesper Dangaard Brouer: Simple drop in FORWARD ~~~~~~~~~~~~~~~~~~~~~~ Dropping via simple iptables net-mask match:: iptables -t raw -N simple || iptables -t raw -F simple iptables -t raw -I simple -s 198.18.0.0/15 -j DROP iptables -t raw -D PREROUTING -j simple iptables -t raw -I PREROUTING -j simple Drop performance in "raw": 11.3Mpps Generator: sending 12.2Mpps (tx:12264083 pps) Drop via original ipset in RAW table ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Create a set with lots of elements:: sudo ./ipset destroy test echo "create test hash:ip hashsize 65536" > test.set for x in `seq 0 255`; do for y in `seq 0 255`; do echo "add test 198.18.$x.$y" >> test.set done done sudo ./ipset restore < test.set Dropping via ipset:: iptables -t raw -F iptables -t raw -N net198 || iptables -t raw -F net198 iptables -t raw -I net198 -m set --match-set test src -j DROP iptables -t raw -I PREROUTING -j net198 Drop performance in "raw" with ipset: 8Mpps Perf report numbers ipset drop in "raw":: + 24.65% ksoftirqd/1 [ip_set] [k] ip_set_test - 21.42% ksoftirqd/1 [kernel.kallsyms] [k] _raw_read_lock_bh - _raw_read_lock_bh + 99.88% ip_set_test - 19.42% ksoftirqd/1 [kernel.kallsyms] [k] _raw_read_unlock_bh - _raw_read_unlock_bh + 99.72% ip_set_test + 4.31% ksoftirqd/1 [ip_set_hash_ip] [k] hash_ip4_kadt + 2.27% ksoftirqd/1 [ixgbe] [k] ixgbe_fetch_rx_buffer + 2.18% ksoftirqd/1 [ip_tables] [k] ipt_do_table + 1.81% ksoftirqd/1 [ip_set_hash_ip] [k] hash_ip4_test + 1.61% ksoftirqd/1 [kernel.kallsyms] [k] __netif_receive_skb_core + 1.44% ksoftirqd/1 [kernel.kallsyms] [k] build_skb + 1.42% ksoftirqd/1 [kernel.kallsyms] [k] ip_rcv + 1.36% ksoftirqd/1 [kernel.kallsyms] [k] __local_bh_enable_ip + 1.16% ksoftirqd/1 [kernel.kallsyms] [k] dev_gro_receive + 1.09% ksoftirqd/1 [kernel.kallsyms] [k] __rcu_read_unlock + 0.96% ksoftirqd/1 [ixgbe] [k] ixgbe_clean_rx_irq + 0.95% ksoftirqd/1 [kernel.kallsyms] [k] __netdev_alloc_frag + 0.88% ksoftirqd/1 [kernel.kallsyms] [k] kmem_cache_alloc + 0.87% ksoftirqd/1 [xt_set] [k] set_match_v3 + 0.85% ksoftirqd/1 [kernel.kallsyms] [k] inet_gro_receive + 0.83% ksoftirqd/1 [kernel.kallsyms] [k] nf_iterate + 0.76% ksoftirqd/1 [kernel.kallsyms] [k] put_compound_page + 0.75% ksoftirqd/1 [kernel.kallsyms] [k] __rcu_read_lock Drop via ipset in RAW table with RCU-locking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ With RCU locking, the RW-lock is gone. Drop performance in "raw" with ipset with RCU-locking: 11.3Mpps Performance-tested-by: Jesper Dangaard Brouer Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 589 ++++++++++++++++----------- net/netfilter/ipset/ip_set_hash_ip.c | 1 + net/netfilter/ipset/ip_set_hash_ipmark.c | 1 + net/netfilter/ipset/ip_set_hash_ipport.c | 1 + net/netfilter/ipset/ip_set_hash_ipportip.c | 1 + net/netfilter/ipset/ip_set_hash_ipportnet.c | 1 + net/netfilter/ipset/ip_set_hash_mac.c | 1 + net/netfilter/ipset/ip_set_hash_net.c | 1 + net/netfilter/ipset/ip_set_hash_netiface.c | 1 + net/netfilter/ipset/ip_set_hash_netnet.c | 1 + net/netfilter/ipset/ip_set_hash_netport.c | 1 + net/netfilter/ipset/ip_set_hash_netportnet.c | 1 + 12 files changed, 364 insertions(+), 236 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5fcf70b0ebc2..f352cc022010 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -10,19 +10,19 @@ #include #include +#include #include -#ifndef rcu_dereference_bh -#define rcu_dereference_bh(p) rcu_dereference(p) -#endif + +#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) +#define ipset_dereference_protected(p, set) \ + __ipset_dereference_protected(p, spin_is_locked(&(set)->lock)) #define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. * Internally jhash is used with the assumption that the size of the - * stored data is a multiple of sizeof(u32). If storage supports timeout, - * the timeout field must be the last one in the data structure - that field - * is ignored when computing the hash key. + * stored data is a multiple of sizeof(u32). * * Readers and resizing * @@ -36,6 +36,8 @@ #define AHASH_INIT_SIZE 4 /* Max number of elements to store in an array block */ #define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) +/* Max muber of elements in the array block when tuned */ +#define AHASH_MAX_TUNED 64 /* Max number of elements can be tuned */ #ifdef IP_SET_HASH_WITH_MULTI @@ -53,7 +55,7 @@ tune_ahash_max(u8 curr, u32 multi) /* Currently, at listing one hash bucket must fit into a message. * Therefore we have a hard limit here. */ - return n > curr && n <= 64 ? n : curr; + return n > curr && n <= AHASH_MAX_TUNED ? n : curr; } #define TUNE_AHASH_MAX(h, multi) \ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) @@ -64,20 +66,23 @@ tune_ahash_max(u8 curr, u32 multi) /* A hash bucket */ struct hbucket { - void *value; /* the array of the values */ + struct rcu_head rcu; /* for call_rcu_bh */ + /* Which positions are used in the array */ + DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ -}; + unsigned char value[0]; /* the array of the values */ +} __attribute__ ((aligned)); /* The hash table: the table size stored here in order to make resizing easy */ struct htable { atomic_t ref; /* References for resizing */ atomic_t uref; /* References for dumping */ u8 htable_bits; /* size of hash table == 2^htable_bits */ - struct hbucket bucket[0]; /* hashtable buckets */ + struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; -#define hbucket(h, i) (&((h)->bucket[i])) +#define hbucket(h, i) ((h)->bucket[i]) #ifndef IPSET_NET_COUNT #define IPSET_NET_COUNT 1 @@ -85,8 +90,8 @@ struct htable { /* Book-keeping of the prefixes added to the set */ struct net_prefixes { - u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ - u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */ + u32 nets[IPSET_NET_COUNT]; /* number of elements for this cidr */ + u8 cidr[IPSET_NET_COUNT]; /* the cidr value */ }; /* Compute the hash table size */ @@ -99,11 +104,11 @@ htable_size(u8 hbits) if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; - return hsize * sizeof(struct hbucket) + sizeof(struct htable); + return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } /* Compute htable_bits from the user input parameter hashsize */ @@ -112,6 +117,7 @@ htable_bits(u32 hashsize) { /* Assume that hashsize == 2^htable_bits */ u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) /* Round up to the first 2^n value */ bits = fls(hashsize); @@ -119,30 +125,6 @@ htable_bits(u32 hashsize) return bits; } -static int -hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) -{ - if (n->pos >= n->size) { - void *tmp; - - if (n->size >= ahash_max) - /* Trigger rehashing */ - return -EAGAIN; - - tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); - if (!tmp) - return -ENOMEM; - if (n->size) { - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - } - n->value = tmp; - n->size += AHASH_INIT_SIZE; - } - return 0; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -300,9 +282,6 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif -#ifdef IP_SET_HASH_WITH_RBTREE - struct rb_root rbtree; -#endif #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[0]; /* book-keeping of prefixes */ #endif @@ -345,8 +324,8 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) for (i = 0; i < nets_length; i++) { if (h->nets[i].cidr[n] != cidr) continue; - h->nets[cidr -1].nets[n]--; - if (h->nets[cidr -1].nets[n] > 0) + h->nets[cidr - 1].nets[n]--; + if (h->nets[cidr - 1].nets[n] > 0) return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; @@ -362,15 +341,18 @@ mtype_ahash_memsize(const struct htype *h, const struct htable *t, u8 nets_length, size_t dsize) { u32 i; - size_t memsize = sizeof(*h) - + sizeof(*t) + struct hbucket *n; + size_t memsize = sizeof(*h) + sizeof(*t); + #ifdef IP_SET_HASH_WITH_NETS - + sizeof(struct net_prefixes) * nets_length + memsize += sizeof(struct net_prefixes) * nets_length; #endif - + jhash_size(t->htable_bits) * sizeof(struct hbucket); - - for (i = 0; i < jhash_size(t->htable_bits); i++) - memsize += t->bucket[i].size * dsize; + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + memsize += sizeof(struct hbucket) + n->size * dsize; + } return memsize; } @@ -385,7 +367,8 @@ mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) int i; for (i = 0; i < n->pos; i++) - ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); + if (test_bit(i, n->used)) + ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); } /* Flush a hash type of set: destroy all elements */ @@ -397,16 +380,16 @@ mtype_flush(struct ip_set *set) struct hbucket *n; u32 i; - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - n->size = n->pos = 0; - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); @@ -422,13 +405,13 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + kfree(n); } ip_set_free(t); @@ -443,10 +426,8 @@ mtype_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&h->gc); - mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); -#ifdef IP_SET_HASH_WITH_RBTREE - rbtree_destroy(&h->rbtree); -#endif + mtype_ahash_destroy(set, __ipset_dereference_protected(h->table, 1), + true); kfree(h); set->data = NULL; @@ -491,20 +472,26 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) struct htable *t; struct hbucket *n; struct mtype_elem *data; - u32 i; - int j; + u32 i, j, d; #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - for (j = 0; j < n->pos; j++) { + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) { + d++; + continue; + } data = ahash_data(n, j, dsize); if (ip_set_timeout_expired(ext_timeout(data, set))) { pr_debug("expired %u/%u\n", i, j); + clear_bit(j, n->used); + smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) mtype_del_cidr(h, @@ -513,29 +500,31 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) nets_length, k); #endif ip_set_ext_destroy(set, data); - if (j != n->pos - 1) - /* Not last one */ - memcpy(data, - ahash_data(n, n->pos - 1, dsize), - dsize); - n->pos--; h->elements--; + d++; } } - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * dsize, - GFP_ATOMIC); + if (d >= AHASH_INIT_SIZE) { + struct hbucket *tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) /* Still try to delete expired elements */ continue; - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - n->value = tmp; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + memcpy(tmp->value + d * dsize, data, dsize); + set_bit(j, tmp->used); + d++; + } + tmp->pos = d; + rcu_assign_pointer(hbucket(t, i), tmp); + kfree_rcu(n, rcu); } } - rcu_read_unlock_bh(); } static void @@ -545,9 +534,9 @@ mtype_gc(unsigned long ul_set) struct htype *h = set->data; pr_debug("called\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); mtype_expire(set, h, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -560,80 +549,115 @@ static int mtype_resize(struct ip_set *set, bool retried) { struct htype *h = set->data; - struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table); - u8 htable_bits = orig->htable_bits; + struct htable *t, *orig; + u8 htable_bits; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; + struct mtype_elem *tmp; #endif struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j; + u32 i, j, key; int ret; - /* Try to cleanup once */ - if (SET_WITH_TIMEOUT(set) && !retried) { - i = h->elements; - write_lock_bh(&set->lock); - mtype_expire(set, set->data, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); - if (h->elements < i) - return 0; - } +#ifdef IP_SET_HASH_WITH_NETS + tmp = kmalloc(dsize, GFP_KERNEL); + if (!tmp) + return -ENOMEM; +#endif + rcu_read_lock_bh(); + orig = rcu_dereference_bh_nfnl(h->table); + htable_bits = orig->htable_bits; + rcu_read_unlock_bh(); retry: ret = 0; htable_bits++; - pr_debug("attempt to resize set %s from %u to %u, t %p\n", - set->name, orig->htable_bits, htable_bits, orig); if (!htable_bits) { /* In case we have plenty of memory :-) */ pr_warn("Cannot increase the hashsize of set %s further\n", set->name); - return -IPSET_ERR_HASH_FULL; + ret = -IPSET_ERR_HASH_FULL; + goto out; + } + t = ip_set_alloc(htable_size(htable_bits)); + if (!t) { + ret = -ENOMEM; + goto out; } - t = ip_set_alloc(sizeof(*t) - + jhash_size(htable_bits) * sizeof(struct hbucket)); - if (!t) - return -ENOMEM; t->htable_bits = htable_bits; - read_lock_bh(&set->lock); + spin_lock_bh(&set->lock); + orig = __ipset_dereference_protected(h->table, 1); /* There can't be another parallel resizing, but dumping is possible */ atomic_set(&orig->ref, 1); atomic_inc(&orig->uref); + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = hbucket(orig, i); + n = __ipset_dereference_protected(hbucket(orig, i), 1); + if (!n) + continue; for (j = 0; j < n->pos; j++) { - data = ahash_data(n, j, set->dsize); + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); #ifdef IP_SET_HASH_WITH_NETS + /* We have readers running parallel with us, + * so the live data cannot be modified. + */ flags = 0; + memcpy(tmp, data, dsize); + data = tmp; mtype_data_reset_flags(data, &flags); #endif - m = hbucket(t, HKEY(data, h->initval, htable_bits)); - ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize); - if (ret < 0) { -#ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(data, &flags); -#endif - atomic_set(&orig->ref, 0); - atomic_dec(&orig->uref); - read_unlock_bh(&set->lock); - mtype_ahash_destroy(set, t, false); - if (ret == -EAGAIN) - goto retry; - return ret; + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference_protected(hbucket(t, key), 1); + if (!m) { + m = kzalloc(sizeof(*m) + + AHASH_INIT_SIZE * dsize, + GFP_ATOMIC); + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + + (m->size + AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - d = ahash_data(m, m->pos++, set->dsize); - memcpy(d, data, set->dsize); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); #ifdef IP_SET_HASH_WITH_NETS mtype_data_reset_flags(d, &flags); #endif } } - rcu_assign_pointer(h->table, t); - read_unlock_bh(&set->lock); + + spin_unlock_bh(&set->lock); /* Give time to other readers of the set */ synchronize_rcu_bh(); @@ -646,7 +670,20 @@ retry: mtype_ahash_destroy(set, orig, false); } - return 0; +out: +#ifdef IP_SET_HASH_WITH_NETS + kfree(tmp); +#endif + return ret; + +cleanup: + atomic_set(&orig->ref, 0); + atomic_dec(&orig->uref); + spin_unlock_bh(&set->lock); + mtype_ahash_destroy(set, t, false); + if (ret == -EAGAIN) + goto retry; + goto out; } /* Add an element to a hash and update the internal counters when succeeded, @@ -659,17 +696,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct htable *t; const struct mtype_elem *d = value; struct mtype_elem *data; - struct hbucket *n; - int i, ret = 0; - int j = AHASH_MAX(h) + 1; + struct hbucket *n, *old = ERR_PTR(-ENOENT); + int i, j = -1; bool flag_exist = flags & IPSET_FLAG_EXIST; + bool deleted = false, forceadd = false, reuse = false; u32 key, multi = 0; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + if (h->elements >= h->maxelem) { + if (SET_WITH_TIMEOUT(set)) + /* FIXME: when set is full, we slow down here */ + mtype_expire(set, h, NLEN(set->family), set->dsize); + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + forceadd = true; + } + + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) { + if (forceadd) { + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } else if (h->elements >= h->maxelem) { + goto set_full; + } + old = NULL; + n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, + GFP_ATOMIC); + if (!n) + return -ENOMEM; + n->size = AHASH_INIT_SIZE; + goto copy_elem; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + /* Reuse first deleted entry */ + if (j == -1) { + deleted = reuse = true; + j = i; + } + continue; + } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { if (flag_exist || @@ -677,85 +746,94 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_timeout_expired(ext_timeout(data, set)))) { /* Just the extensions could be overwritten */ j = i; - goto reuse_slot; - } else { - ret = -IPSET_ERR_EXIST; - goto out; + goto overwrite_extensions; } + return -IPSET_ERR_EXIST; } /* Reuse first timed out entry */ if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set)) && - j != AHASH_MAX(h) + 1) + j == -1) { j = i; + reuse = true; + } } - if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set) && n->pos) { - /* Choosing the first entry in the array to replace */ - j = 0; - goto reuse_slot; - } - if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h, NLEN(set->family), set->dsize); - - if (h->elements >= h->maxelem) { - if (net_ratelimit()) - pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - -reuse_slot: - if (j != AHASH_MAX(h) + 1) { - /* Fill out reused slot */ + if (reuse || forceadd) { data = ahash_data(n, j, set->dsize); + if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) { - mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(data->cidr, i)), - NLEN(set->family), i); - mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), - NLEN(set->family), i); - } + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, i)), + NLEN(set->family), i); #endif - ip_set_ext_destroy(set, data); - } else { - /* Use/create a new slot */ + ip_set_ext_destroy(set, data); + h->elements--; + } + goto copy_data; + } + if (h->elements >= h->maxelem) + goto set_full; + /* Create a new slot */ + if (n->pos >= n->size) { TUNE_AHASH_MAX(h, multi); - ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize); - if (ret != 0) { - if (ret == -EAGAIN) - mtype_data_next(&h->next, d); - goto out; + if (n->size >= AHASH_MAX(h)) { + /* Trigger rehashing */ + mtype_data_next(&h->next, d); + return -EAGAIN; } - data = ahash_data(n, n->pos++, set->dsize); + old = n; + n = kzalloc(sizeof(*n) + + (old->size + AHASH_INIT_SIZE) * set->dsize, + GFP_ATOMIC); + if (!n) + return -ENOMEM; + memcpy(n, old, sizeof(struct hbucket) + + old->size * set->dsize); + n->size = old->size + AHASH_INIT_SIZE; + } + +copy_elem: + j = n->pos++; + data = ahash_data(n, j, set->dsize); +copy_data: + h->elements++; #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), - NLEN(set->family), i); + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), + NLEN(set->family), i); #endif - h->elements++; - } memcpy(data, d, sizeof(struct mtype_elem)); +overwrite_extensions: #ifdef IP_SET_HASH_WITH_NETS mtype_data_set_flags(data, flags); #endif - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(data, set), ext->timeout); if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(data, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(data, set), ext); + /* Must come last for the case when timed out entry is reused */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, set), ext->timeout); + smp_mb__before_atomic(); + set_bit(j, n->used); + if (old != ERR_PTR(-ENOENT)) { + rcu_assign_pointer(hbucket(t, key), n); + if (old) + kfree_rcu(old, rcu); + } -out: - rcu_read_unlock_bh(); - return ret; + return 0; +set_full: + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; } -/* Delete an element from the hash: swap it with the last element - * and free up space if possible. +/* Delete an element from the hash and free up space if possible. */ static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -766,29 +844,32 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, ret = -IPSET_ERR_EXIST; -#ifdef IP_SET_HASH_WITH_NETS - u8 j; -#endif + int i, j, k, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; + size_t dsize = set->dsize; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); - for (i = 0; i < n->pos; i++) { - data = ahash_data(n, i, set->dsize); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) + goto out; + for (i = 0, k = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + k++; + continue; + } + data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set))) goto out; - if (i != n->pos - 1) - /* Not last one */ - memcpy(data, ahash_data(n, n->pos - 1, set->dsize), - set->dsize); - n->pos--; + ret = 0; + clear_bit(i, n->used); + smp_mb__after_atomic(); + if (i + 1 == n->pos) + n->pos--; h->elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) @@ -796,25 +877,37 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, NLEN(set->family), j); #endif ip_set_ext_destroy(set, data); - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * set->dsize, - GFP_ATOMIC); - if (!tmp) { - ret = 0; + + for (; i < n->pos; i++) { + if (!test_bit(i, n->used)) + k++; + } + if (n->pos == 0 && k == 0) { + rcu_assign_pointer(hbucket(t, key), NULL); + kfree_rcu(n, rcu); + } else if (k >= AHASH_INIT_SIZE) { + struct hbucket *tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) goto out; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, k = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + memcpy(tmp->value + k * dsize, data, dsize); + set_bit(j, tmp->used); + k++; } - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * set->dsize); - kfree(n->value); - n->value = tmp; + tmp->pos = k; + rcu_assign_pointer(hbucket(t, key), tmp); + kfree_rcu(n, rcu); } - ret = 0; goto out; } out: - rcu_read_unlock_bh(); return ret; } @@ -865,8 +958,12 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (!mtype_data_equal(data, d, &multi)) continue; @@ -904,7 +1001,6 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; - rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, @@ -919,8 +1015,14 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) { + ret = 0; + goto out; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi) && !(SET_WITH_TIMEOUT(set) && @@ -930,7 +1032,6 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, } } out: - rcu_read_unlock_bh(); return ret; } @@ -942,15 +1043,19 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u8 htable_bits; + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); + htable_bits = t->htable_bits; + rcu_read_unlock_bh(); nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, - htonl(jhash_size(t->htable_bits))) || + htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; #ifdef IP_SET_HASH_WITH_NETMASK @@ -1010,20 +1115,27 @@ mtype_list(const struct ip_set *set, u32 first = cb->args[IPSET_CB_ARG0]; /* We assume that one hash bucket fills into one page */ void *incomplete; - int i; + int i, ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); t = (const struct htable *)cb->args[IPSET_CB_PRIVATE]; + /* Expire may replace a hbucket with another one */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); cb->args[IPSET_CB_ARG0]++) { incomplete = skb_tail_pointer(skb); - n = hbucket(t, cb->args[IPSET_CB_ARG0]); + n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0])); pr_debug("cb->arg bucket: %lu, t %p n %p\n", cb->args[IPSET_CB_ARG0], t, n); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; e = ahash_data(n, i, set->dsize); if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) @@ -1034,7 +1146,8 @@ mtype_list(const struct ip_set *set, if (!nested) { if (cb->args[IPSET_CB_ARG0] == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; + ret = -EMSGSIZE; + goto out; } else goto nla_put_failure; } @@ -1049,7 +1162,7 @@ mtype_list(const struct ip_set *set, /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nlmsg_trim(skb, incomplete); @@ -1057,10 +1170,12 @@ nla_put_failure: pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n", set->name); cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; - } - ipset_nest_end(skb, atd); - return 0; + ret = -EMSGSIZE; + } else + ipset_nest_end(skb, atd); +out: + rcu_read_unlock(); + return ret; } static int @@ -1122,12 +1237,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || -#ifdef IP_SET_HASH_WITH_MARKMASK - !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || -#endif !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; +#ifdef IP_SET_HASH_WITH_MARKMASK + /* Separated condition in order to avoid directive in argument list */ + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK))) + return -IPSET_ERR_PROTOCOL; +#endif if (tb[IPSET_ATTR_HASHSIZE]) { hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); @@ -1150,7 +1267,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_MARKMASK if (tb[IPSET_ATTR_MARKMASK]) { - markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK])); if (markmask == 0) return -IPSET_ERR_INVALID_MARKMASK; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 2bbadcc96ac5..f54d7069d633 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -315,6 +315,7 @@ hash_ip_init(void) static void __exit hash_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ip_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 3aafb36484b4..f8fbc325ad34 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -319,6 +319,7 @@ hash_ipmark_init(void) static void __exit hash_ipmark_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipmark_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 4db1270f1197..9a31db8ccca6 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -382,6 +382,7 @@ hash_ipport_init(void) static void __exit hash_ipport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipport_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index c01bf68708ec..fc42489f8795 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -397,6 +397,7 @@ hash_ipportip_init(void) static void __exit hash_ipportip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 50248debdc8b..2a69b9bf66b8 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -554,6 +554,7 @@ hash_ipportnet_init(void) static void __exit hash_ipportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 8981c8b242b3..112aff3cda96 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -165,6 +165,7 @@ hash_mac_init(void) static void __exit hash_mac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_mac_type); } diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 089b23fd1a94..e49b1d010d30 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -392,6 +392,7 @@ hash_net_init(void) static void __exit hash_net_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_net_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 3258189a296f..42c893e08842 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -500,6 +500,7 @@ hash_netiface_init(void) static void __exit hash_netiface_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netiface_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index ed9cc45084dd..b5428be1f159 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -480,6 +480,7 @@ hash_netnet_init(void) static void __exit hash_netnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netnet_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index fbaf8138e5d4..27307d0a8a5d 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -498,6 +498,7 @@ hash_netport_init(void) static void __exit hash_netport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netport_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index a828cbc8bed7..1e0e47ae40a4 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -581,6 +581,7 @@ hash_netportnet_init(void) static void __exit hash_netportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netportnet_type); } -- cgit From 00590fdd5be0d763631ef10e6a3e2ce8fc2d9ec3 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 16:56:02 +0200 Subject: netfilter: ipset: Introduce RCU locking in list type Standard rculist is used. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_list_set.c | 398 ++++++++++++++++------------------ 1 file changed, 189 insertions(+), 209 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 107ea6cc47f1..9f624ee9a41e 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -27,6 +28,8 @@ MODULE_ALIAS("ip_set_list:set"); /* Member elements */ struct set_elem { + struct rcu_head rcu; + struct list_head list; ip_set_id_t id; }; @@ -41,12 +44,9 @@ struct list_set { u32 size; /* size of set list array */ struct timer_list gc; /* garbage collection */ struct net *net; /* namespace */ - struct set_elem members[0]; /* the set members */ + struct list_head members; /* the set members */ }; -#define list_set_elem(set, map, id) \ - (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize) - static int list_set_ktest(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -54,17 +54,14 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i, cmdflags = opt->cmdflags; + u32 cmdflags = opt->cmdflags; int ret; /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -91,13 +88,9 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -115,13 +108,9 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -138,110 +127,65 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + int ret = -EINVAL; + rcu_read_lock(); switch (adt) { case IPSET_TEST: - return list_set_ktest(set, skb, par, opt, &ext); + ret = list_set_ktest(set, skb, par, opt, &ext); + break; case IPSET_ADD: - return list_set_kadd(set, skb, par, opt, &ext); + ret = list_set_kadd(set, skb, par, opt, &ext); + break; case IPSET_DEL: - return list_set_kdel(set, skb, par, opt, &ext); + ret = list_set_kdel(set, skb, par, opt, &ext); + break; default: break; } - return -EINVAL; -} - -static bool -id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) -{ - const struct list_set *map = set->data; - const struct set_elem *e; - - if (i >= map->size) - return 0; + rcu_read_unlock(); - e = list_set_elem(set, map, i); - return !!(e->id == id && - !(SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set)))); + return ret; } -static int -list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, - const struct ip_set_ext *ext) -{ - struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); - - if (e->id != IPSET_INVALID_ID) { - if (i == map->size - 1) { - /* Last element replaced: e.g. add new,before,last */ - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - } else { - struct set_elem *x = list_set_elem(set, map, - map->size - 1); - - /* Last element pushed off */ - if (x->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, x->id); - ip_set_ext_destroy(set, x); - } - memmove(list_set_elem(set, map, i + 1), e, - set->dsize * (map->size - (i + 1))); - /* Extensions must be initialized to zero */ - memset(e, 0, set->dsize); - } - } - - e->id = d->id; - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); - return 0; -} +/* Userspace interfaces: we are protected by the nfnl mutex */ -static int -list_set_del(struct ip_set *set, u32 i) +static void +__list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); ip_set_put_byindex(map->net, e->id); + /* We may call it, because we don't have a to be destroyed + * extension which is used by the kernel. + */ ip_set_ext_destroy(set, e); + kfree_rcu(e, rcu); +} - if (i < map->size - 1) - memmove(e, list_set_elem(set, map, i + 1), - set->dsize * (map->size - (i + 1))); +static inline void +list_set_del(struct ip_set *set, struct set_elem *e) +{ + list_del_rcu(&e->list); + __list_set_del(set, e); +} - /* Last element */ - e = list_set_elem(set, map, map->size - 1); - e->id = IPSET_INVALID_ID; - return 0; +static inline void +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) +{ + list_replace_rcu(&old->list, &e->list); + __list_set_del(set, old); } static void set_cleanup_entries(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i = 0; + struct set_elem *e, *n; - while (i < map->size) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID && - ip_set_timeout_expired(ext_timeout(e, set))) - list_set_del(set, i); - /* Check element moved to position i in next loop */ - else - i++; - } + list_for_each_entry_safe(e, n, &map->members, list) + if (ip_set_timeout_expired(ext_timeout(e, set))) + list_set_del(set, e); } static int @@ -250,31 +194,45 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; + struct set_elem *e, *next, *prev = NULL; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } if (d->before == 0) - return 1; - else if (d->before > 0) - ret = id_eq(set, i + 1, d->refid); - else - ret = i > 0 && id_eq(set, i - 1, d->refid); + ret = 1; + else if (d->before > 0) { + next = list_next_entry(e, list); + ret = !list_is_last(&e->list, &map->members) && + next->id == d->refid; + } else + ret = prev && prev->id == d->refid; return ret; } return 0; } +static void +list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext, + struct set_elem *e) +{ + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); + /* Update timeout last */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, set), ext->timeout); +} static int list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -282,60 +240,78 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; + struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; - u32 i, ret = 0; if (SET_WITH_TIMEOUT(set)) set_cleanup_entries(set); - /* Check already added element */ - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto insert; - else if (e->id != d->id) + /* Find where to add the new entry */ + n = prev = next = NULL; + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - - if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) || - (d->before < 0 && - (i == 0 || !id_eq(set, i - 1, d->refid)))) - /* Before/after doesn't match */ + else if (d->id == e->id) + n = e; + else if (d->before == 0 || e->id != d->refid) + continue; + else if (d->before > 0) + next = e; + else + prev = e; + } + /* Re-add already existing element */ + if (n) { + if ((d->before > 0 && !next) || + (d->before < 0 && !prev)) return -IPSET_ERR_REF_EXIST; if (!flag_exist) - /* Can't re-add */ return -IPSET_ERR_EXIST; /* Update extensions */ - ip_set_ext_destroy(set, e); + ip_set_ext_destroy(set, n); + list_set_init_extensions(set, ext, n); - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; } -insert: - ret = -IPSET_ERR_LIST_FULL; - for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - ret = d->before != 0 ? -IPSET_ERR_REF_EXIST - : list_set_add(set, i, d, ext); - else if (e->id != d->refid) - continue; - else if (d->before > 0) - ret = list_set_add(set, i, d, ext); - else if (i + 1 < map->size) - ret = list_set_add(set, i + 1, d, ext); + /* Add new entry */ + if (d->before == 0) { + /* Append */ + n = list_empty(&map->members) ? NULL : + list_last_entry(&map->members, struct set_elem, list); + } else if (d->before > 0) { + /* Insert after next element */ + if (!list_is_last(&next->list, &map->members)) + n = list_next_entry(next, list); + } else { + /* Insert before prev element */ + if (prev->list.prev != &map->members) + n = list_prev_entry(prev, list); } + /* Can we replace a timed out entry? */ + if (n && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(n, set)))) + n = NULL; + + e = kzalloc(set->dsize, GFP_KERNEL); + if (!e) + return -ENOMEM; + e->id = d->id; + INIT_LIST_HEAD(&e->list); + list_set_init_extensions(set, ext, e); + if (n) + list_set_replace(set, e, n); + else if (next) + list_add_tail_rcu(&e->list, &next->list); + else if (prev) + list_add_rcu(&e->list, &prev->list); + else + list_add_tail_rcu(&e->list, &map->members); - return ret; + return 0; } static int @@ -344,32 +320,30 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return d->before != 0 ? -IPSET_ERR_REF_EXIST - : -IPSET_ERR_EXIST; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + struct set_elem *e, *next, *prev = NULL; + + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } - if (d->before == 0) - return list_set_del(set, i); - else if (d->before > 0) { - if (!id_eq(set, i + 1, d->refid)) + if (d->before > 0) { + next = list_next_entry(e, list); + if (list_is_last(&e->list, &map->members) || + next->id != d->refid) return -IPSET_ERR_REF_EXIST; - return list_set_del(set, i); - } else if (i == 0 || !id_eq(set, i - 1, d->refid)) - return -IPSET_ERR_REF_EXIST; - else - return list_set_del(set, i); + } else if (d->before < 0) { + if (!prev || prev->id != d->refid) + return -IPSET_ERR_REF_EXIST; + } + list_set_del(set, e); + return 0; } - return -IPSET_ERR_EXIST; + return d->before != 0 ? -IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST; } static int @@ -404,6 +378,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + e.before = f & IPSET_FLAG_BEFORE; } @@ -441,27 +416,26 @@ static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - e->id = IPSET_INVALID_ID; - } - } + struct set_elem *e, *n; + + list_for_each_entry_safe(e, n, &map->members, list) + list_set_del(set, e); } static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; + struct set_elem *e, *n; if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - list_set_flush(set); + list_for_each_entry_safe(e, n, &map->members, list) { + list_del(&e->list); + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + kfree(e); + } kfree(map); set->data = NULL; @@ -472,6 +446,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) { const struct list_set *map = set->data; struct nlattr *nested; + struct set_elem *e; + u32 n = 0; + + list_for_each_entry(e, &map->members, list) + n++; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) @@ -479,7 +458,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->size * set->dsize))) + htonl(sizeof(*map) + n * set->dsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -496,18 +475,22 @@ list_set_list(const struct ip_set *set, { const struct list_set *map = set->data; struct nlattr *atd, *nested; - u32 i, first = cb->args[IPSET_CB_ARG0]; - const struct set_elem *e; + u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + struct set_elem *e; + int ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; - for (; cb->args[IPSET_CB_ARG0] < map->size; - cb->args[IPSET_CB_ARG0]++) { - i = cb->args[IPSET_CB_ARG0]; - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto finish; + list_for_each_entry(e, &map->members, list) { + if (i == first) + break; + i++; + } + + rcu_read_lock(); + list_for_each_entry_from(e, &map->members, list) { + i++; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -515,9 +498,10 @@ list_set_list(const struct ip_set *set, if (!nested) { if (i == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + goto nla_put_failure; } if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(map->net, e->id))) @@ -526,20 +510,23 @@ list_set_list(const struct ip_set *set, goto nla_put_failure; ipset_nest_end(skb, nested); } -finish: + ipset_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(i == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } + cb->args[IPSET_CB_ARG0] = i - 1; ipset_nest_end(skb, atd); - return 0; +out: + rcu_read_unlock(); + return ret; } static bool @@ -574,9 +561,9 @@ list_set_gc(unsigned long ul_set) struct ip_set *set = (struct ip_set *) ul_set; struct list_set *map = set->data; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set_cleanup_entries(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -600,24 +587,16 @@ static bool init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; - struct set_elem *e; - u32 i; - map = kzalloc(sizeof(*map) + - min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize, - GFP_KERNEL); + map = kzalloc(sizeof(*map), GFP_KERNEL); if (!map) return false; map->size = size; map->net = net; + INIT_LIST_HEAD(&map->members); set->data = map; - for (i = 0; i < size; i++) { - e = list_set_elem(set, map, i); - e->id = IPSET_INVALID_ID; - } - return true; } @@ -690,6 +669,7 @@ list_set_init(void) static void __exit list_set_fini(void) { + rcu_barrier(); ip_set_type_unregister(&list_set_type); } -- cgit From ca0f6a5cd99e0c6ba4bb78dc402817f636370f26 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 19:45:33 +0200 Subject: netfilter: ipset: Fix coding styles reported by checkpatch.pl Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_bitmap_gen.h | 11 +- net/netfilter/ipset/ip_set_bitmap_ip.c | 12 +- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 21 +-- net/netfilter/ipset/ip_set_bitmap_port.c | 7 +- net/netfilter/ipset/ip_set_core.c | 201 +++++++++++++-------------- net/netfilter/ipset/ip_set_getport.c | 13 +- net/netfilter/ipset/ip_set_hash_gen.h | 55 ++++---- net/netfilter/ipset/ip_set_hash_ip.c | 4 +- net/netfilter/ipset/ip_set_hash_ipmark.c | 9 +- net/netfilter/ipset/ip_set_hash_ipport.c | 14 +- net/netfilter/ipset/ip_set_hash_ipportip.c | 16 ++- net/netfilter/ipset/ip_set_hash_ipportnet.c | 19 ++- net/netfilter/ipset/ip_set_hash_mac.c | 6 +- net/netfilter/ipset/ip_set_hash_net.c | 8 +- net/netfilter/ipset/ip_set_hash_netiface.c | 25 ++-- net/netfilter/ipset/ip_set_hash_netnet.c | 46 +++--- net/netfilter/ipset/ip_set_hash_netport.c | 19 ++- net/netfilter/ipset/ip_set_hash_netportnet.c | 54 +++---- net/netfilter/ipset/ip_set_list_set.c | 11 +- net/netfilter/ipset/pfxlen.c | 16 +-- net/netfilter/xt_set.c | 44 +++--- 21 files changed, 322 insertions(+), 289 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 86429f369128..d05e759ed0fa 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -41,7 +41,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct mtype *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -223,7 +223,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT - mtype_is_filled((const struct mtype_elem *) x) && + mtype_is_filled((const struct mtype_elem *)x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; @@ -240,7 +240,7 @@ mtype_list(const struct ip_set *set, if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, - mtype_is_filled((const struct mtype_elem *) x))) + mtype_is_filled((const struct mtype_elem *)x))) goto nla_put_failure; ipset_nest_end(skb, nested); } @@ -266,13 +266,14 @@ out: static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct mtype *map = set->data; void *x; u32 id; /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ + * but adding/deleting new entries is locked out + */ spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index b8ce474c038d..64a564334418 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -59,7 +59,7 @@ struct bitmap_ip_adt_elem { static inline u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { - return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; + return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; } /* Common functions */ @@ -175,8 +175,9 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); - } else + } else { ip_to = ip; + } if (ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; @@ -187,8 +188,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -278,8 +279,9 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index fe00e87decc8..1430535118fb 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -90,7 +90,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, return 0; elem = get_elem(map->extensions, e->id, dsize); if (elem->filled == MAC_FILLED) - return e->ether == NULL || + return !e->ether || ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; @@ -131,7 +131,8 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, - * possibly by the kernel */ + * possibly by the kernel + */ if (e->ether) ip_set_timeout_set(timeout, t); else @@ -155,7 +156,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, /* memcpy isn't atomic */ clear_bit(e->id, map->members); smp_mb__after_atomic(); - memcpy(elem->ether, e->ether, ETH_ALEN); + ether_addr_copy(elem->ether, e->ether); } return IPSET_ADD_FAILED; } else if (!e->ether) @@ -164,19 +165,18 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, /* Fill the MAC address and trigger the timer activation */ clear_bit(e->id, map->members); smp_mb__after_atomic(); - memcpy(elem->ether, e->ether, ETH_ALEN); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; } else if (e->ether) { /* We can store MAC too */ - memcpy(elem->ether, e->ether, ETH_ALEN); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return 0; - } else { - elem->filled = MAC_UNSET; - /* MAC is not stored yet, don't start timer */ - return IPSET_ADD_STORE_PLAIN_TIMEOUT; } + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; } static inline int @@ -352,8 +352,9 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } elements = (u64)last_ip - first_ip + 1; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 2d360f951d18..5338ccd5da46 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -162,8 +162,9 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (port < map->first_port) return -IPSET_ERR_BITMAP_RANGE; } - } else + } else { port_to = port; + } if (port_to > map->last_port) return -IPSET_ERR_BITMAP_RANGE; @@ -174,8 +175,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 2b21a1983a98..338b4047776f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -35,6 +35,7 @@ struct ip_set_net { bool is_deleted; /* deleted by ip_set_net_exit */ bool is_destroyed; /* all sets are destroyed */ }; + static int ip_set_net_id __read_mostly; static inline struct ip_set_net *ip_set_pernet(struct net *net) @@ -60,8 +61,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] -/* - * The set types are implemented in modules and registered set types +/* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is * serialized by ip_set_type_mutex. */ @@ -131,7 +131,8 @@ __find_set_type_get(const char *name, u8 family, u8 revision, goto unlock; } /* Make sure the type is already loaded - * but we don't support the revision */ + * but we don't support the revision + */ list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; @@ -290,7 +291,7 @@ static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -307,7 +308,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -318,7 +319,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) return -IPSET_ERR_PROTOCOL; memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), - sizeof(struct in6_addr)); + sizeof(struct in6_addr)); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); @@ -467,8 +468,7 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, } EXPORT_SYMBOL_GPL(ip_set_put_extensions); -/* - * Creating/destroying/renaming/swapping affect the existence and +/* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. * @@ -495,8 +495,7 @@ __ip_set_put(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -/* - * Add, del and test set entries from kernel. +/* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced * so it can't be destroyed (or changed) under our foot. @@ -524,7 +523,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, dev_net(par->in ? par->in : par->out), index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || @@ -563,7 +562,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, dev_net(par->in ? par->in : par->out), index); int ret; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || @@ -586,7 +585,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, dev_net(par->in ? par->in : par->out), index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || @@ -601,8 +600,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, } EXPORT_SYMBOL_GPL(ip_set_del); -/* - * Find set by name, reference it once. The reference makes sure the +/* Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * */ @@ -616,7 +614,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; - if (s != NULL && STRNCMP(s->name, name)) { + if (s && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; @@ -629,8 +627,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) } EXPORT_SYMBOL_GPL(ip_set_get_byname); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * @@ -643,7 +640,7 @@ __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) rcu_read_lock(); set = rcu_dereference(inst->ip_set_list)[index]; - if (set != NULL) + if (set) __ip_set_put(set); rcu_read_unlock(); } @@ -657,8 +654,7 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_put_byindex); -/* - * Get the name of a set behind a set index. +/* Get the name of a set behind a set index. * We assume the set is referenced, so it does exist and * can't be destroyed. The set cannot be renamed due to * the referencing either. @@ -669,7 +665,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) { const struct ip_set *set = ip_set_rcu_get(net, index); - BUG_ON(set == NULL); + BUG_ON(!set); BUG_ON(set->ref == 0); /* Referenced, so it's safe */ @@ -677,13 +673,11 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_name_byindex); -/* - * Routines to call by external subsystems, which do not +/* Routines to call by external subsystems, which do not * call nfnl_lock for us. */ -/* - * Find set by index, reference it once. The reference makes sure the +/* Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. @@ -709,8 +703,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * @@ -725,15 +718,14 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index) nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ set = ip_set(inst, index); - if (set != NULL) + if (set) __ip_set_put(set); } nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); -/* - * Communication protocol with userspace over netlink. +/* Communication protocol with userspace over netlink. * * The commands are serialized by the nfnl mutex. */ @@ -760,7 +752,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), sizeof(*nfmsg), flags); - if (nlh == NULL) + if (!nlh) return NULL; nfmsg = nlmsg_data(nlh); @@ -793,7 +785,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL && STRNCMP(set->name, name)) { + if (set && STRNCMP(set->name, name)) { *id = i; break; } @@ -819,7 +811,7 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s == NULL) { + if (!s) { if (*index == IPSET_INVALID_ID) *index = i; } else if (STRNCMP(name, s->name)) { @@ -851,18 +843,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; - struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; u32 flags = flag_exist(nlh); int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_REVISION] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL || - (attr[IPSET_ATTR_DATA] != NULL && + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_REVISION] || + !attr[IPSET_ATTR_FAMILY] || + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])))) return -IPSET_ERR_PROTOCOL; @@ -873,11 +865,10 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", name, typename, family_name(family), revision); - /* - * First, and without any locks, allocate and initialize + /* First, and without any locks, allocate and initialize * a normal base set structure. */ - set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); + set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; spin_lock_init(&set->lock); @@ -885,21 +876,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, set->family = family; set->revision = revision; - /* - * Next, check that we know the type, and take + /* Next, check that we know the type, and take * a reference on the type, to make sure it stays available * while constructing our new set. * * After referencing the type, we try to create the type * specific part of the set without holding any locks. */ - ret = find_set_type_get(typename, family, revision, &(set->type)); + ret = find_set_type_get(typename, family, revision, &set->type); if (ret) goto out; - /* - * Without holding any locks, create private part. - */ + /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy)) { @@ -913,8 +901,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* BTW, ret==0 here. */ - /* - * Here, we have a valid, constructed set and we are protected + /* Here, we have a valid, constructed set and we are protected * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. */ @@ -937,7 +924,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* Wraparound */ goto cleanup; - list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); + list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -951,12 +938,11 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, inst->ip_set_max = i; kfree(tmp); ret = 0; - } else if (ret) + } else if (ret) { goto cleanup; + } - /* - * Finally! Add our shiny new set to the list, and be done. - */ + /* Finally! Add our shiny new set to the list, and be done. */ pr_debug("create: '%s' created with index %u!\n", set->name, index); ip_set(inst, index) = set; @@ -1018,7 +1004,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && s->ref) { + if (s && s->ref) { ret = -IPSET_ERR_BUSY; goto out; } @@ -1037,7 +1023,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, } else { s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); - if (s == NULL) { + if (!s) { ret = -ENOENT; goto out; } else if (s->ref) { @@ -1082,12 +1068,12 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) + if (s) ip_set_flush_set(s); } } else { s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (s == NULL) + if (!s) return -ENOENT; ip_set_flush_set(s); @@ -1119,12 +1105,12 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; read_lock_bh(&ip_set_ref_lock); @@ -1136,7 +1122,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && STRNCMP(s->name, name2)) { + if (s && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } @@ -1168,23 +1154,24 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, char from_name[IPSET_MAXNAMELEN]; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); - if (from == NULL) + if (!from) return -ENOENT; to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); - if (to == NULL) + if (!to) return -IPSET_ERR_EXIST_SETNAME2; /* Features must not change. - * Not an artificial restriction anymore, as we must prevent - * possible loops created by swapping in setlist type of sets. */ + * Not an artifical restriction anymore, as we must prevent + * possible loops created by swapping in setlist type of sets. + */ if (!(from->type->features == to->type->features && from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; @@ -1246,7 +1233,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; @@ -1260,16 +1247,18 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); - if (set == NULL) + if (!set) return -ENOENT; dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; - } else + } else { dump_type = DUMP_ALL; + } if (cda[IPSET_ATTR_FLAGS]) { u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); + dump_type |= (f << 16); } cb->args[IPSET_CB_NET] = (unsigned long)inst; @@ -1295,7 +1284,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) if (ret < 0) { nlh = nlmsg_hdr(cb->skb); /* We have to create and send the error message - * manually :-( */ + * manually :-( + */ if (nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(cb->skb, nlh, ret); return ret; @@ -1313,7 +1303,7 @@ dump_last: pr_debug("dump type, flag: %u %u index: %ld\n", dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { - index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; + index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); is_destroyed = inst->is_destroyed; @@ -1480,12 +1470,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, size_t payload = min(SIZE_MAX, sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *cmdattr; u32 *errline; skb2 = nlmsg_new(payload, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); @@ -1502,7 +1492,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1517,25 +1508,25 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1572,25 +1563,25 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1627,17 +1618,17 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_DATA] == NULL || + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], @@ -1668,15 +1659,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL)) + !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1725,8 +1716,8 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL)) + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); @@ -1736,7 +1727,7 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb, return ret; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1781,11 +1772,11 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh2; int ret = 0; - if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) + if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1913,7 +1904,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) ret = -EFAULT; goto done; } - op = (unsigned int *) data; + op = (unsigned int *)data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ @@ -2025,7 +2016,7 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); + list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; @@ -2061,11 +2052,11 @@ static struct pernet_operations ip_set_net_ops = { .size = sizeof(struct ip_set_net) }; - static int __init ip_set_init(void) { int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); return ret; diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 1981f021cc60..42c3e3ba1b94 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -30,7 +30,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct tcphdr *th; th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); - if (th == NULL) + if (!th) /* No choice either */ return false; @@ -42,7 +42,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const sctp_sctphdr_t *sh; sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); - if (sh == NULL) + if (!sh) /* No choice either */ return false; @@ -55,7 +55,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct udphdr *uh; uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); - if (uh == NULL) + if (!uh) /* No choice either */ return false; @@ -67,7 +67,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmphdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16)htons((ic->type << 8) | ic->code); @@ -78,7 +78,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmp6hdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16) @@ -116,7 +116,8 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, return false; default: /* Other protocols doesn't have ports, - so we can match fragments */ + * so we can match fragments. + */ *proto = protocol; return true; } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index f352cc022010..afe905c208af 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -35,7 +35,7 @@ /* Number of elements to store in an initial array block */ #define AHASH_INIT_SIZE 4 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) /* Max muber of elements in the array block when tuned */ #define AHASH_MAX_TUNED 64 @@ -57,6 +57,7 @@ tune_ahash_max(u8 curr, u32 multi) */ return n > curr && n <= AHASH_MAX_TUNED ? n : curr; } + #define TUNE_AHASH_MAX(h, multi) \ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) #else @@ -256,7 +257,7 @@ htable_bits(u32 hashsize) #endif #define HKEY(data, initval, htable_bits) \ -(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ +(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \ & jhash_mask(htable_bits)) #ifndef htype @@ -299,11 +300,11 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) { - if (j != -1) + if (j != -1) { continue; - else if (h->nets[i].cidr[n] < cidr) + } else if (h->nets[i].cidr[n] < cidr) { j = i; - else if (h->nets[i].cidr[n] == cidr) { + } else if (h->nets[i].cidr[n] == cidr) { h->nets[cidr - 1].nets[n]++; return; } @@ -322,15 +323,15 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) u8 i, j, net_end = nets_length - 1; for (i = 0; i < nets_length; i++) { - if (h->nets[i].cidr[n] != cidr) - continue; + if (h->nets[i].cidr[n] != cidr) + continue; h->nets[cidr - 1].nets[n]--; if (h->nets[cidr - 1].nets[n] > 0) - return; + return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) - h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; + h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + return; } } #endif @@ -426,8 +427,8 @@ mtype_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&h->gc); - mtype_ahash_destroy(set, __ipset_dereference_protected(h->table, 1), - true); + mtype_ahash_destroy(set, + __ipset_dereference_protected(h->table, 1), true); kfree(h); set->data = NULL; @@ -439,7 +440,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct htype *h = set->data; init_timer(&h->gc); - h->gc.data = (unsigned long) set; + h->gc.data = (unsigned long)set; h->gc.function = gc; h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -530,7 +531,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct htype *h = set->data; pr_debug("called\n"); @@ -544,7 +545,8 @@ mtype_gc(unsigned long ul_set) /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or - * fail due to memory pressures. */ + * fail due to memory pressures. + */ static int mtype_resize(struct ip_set *set, bool retried) { @@ -687,7 +689,8 @@ cleanup: } /* Add an element to a hash and update the internal counters when succeeded, - * otherwise report the proper error code. */ + * otherwise report the proper error code. + */ static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) @@ -926,7 +929,8 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, #ifdef IP_SET_HASH_WITH_NETS /* Special test function which takes into account the different network - * sizes added to the set */ + * sizes added to the set + */ static int mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, const struct ip_set_ext *ext, @@ -1004,7 +1008,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, - * try all possible network sizes */ + * try all possible network sizes + */ for (i = 0; i < IPSET_NET_COUNT; i++) if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family)) break; @@ -1148,8 +1153,8 @@ mtype_list(const struct ip_set *set, nla_nest_cancel(skb, atd); ret = -EMSGSIZE; goto out; - } else - goto nla_put_failure; + } + goto nla_put_failure; } if (mtype_data_list(skb, e)) goto nla_put_failure; @@ -1171,8 +1176,9 @@ nla_put_failure: set->name); cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; - } else + } else { ipset_nest_end(skb, atd); + } out: rcu_read_unlock(); return ret; @@ -1180,12 +1186,13 @@ out: static int IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt); + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); static int IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + enum ipset_adt adt, u32 *lineno, u32 flags, + bool retried); static const struct ip_set_type_variant mtype_variant = { .kadt = mtype_kadt, diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index f54d7069d633..9d6bf19f7b78 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -158,8 +158,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index f8fbc325ad34..a0695a2ab585 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -155,8 +155,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -206,7 +206,6 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" - static int hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -268,10 +267,8 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; - return ret; + return 0; } static struct ip_set_type hash_ipmark_type __read_mostly = { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 9a31db8ccca6..9d84b3dff603 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -140,8 +140,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -187,8 +188,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -305,8 +306,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -329,8 +331,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index fc42489f8795..215b7b942038 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -63,7 +63,7 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, static bool hash_ipportip4_data_list(struct sk_buff *skb, - const struct hash_ipportip4_elem *data) + const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || @@ -147,8 +147,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -194,8 +195,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -320,8 +321,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -344,8 +346,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 2a69b9bf66b8..9ca719625ea3 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -209,14 +209,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -263,8 +265,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -287,8 +290,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -466,14 +469,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -497,8 +502,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 112aff3cda96..f1e7d2c0f685 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -89,10 +89,10 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, return 0; if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); @@ -116,7 +116,7 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -IPSET_ERR_HASH_ELEM; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index e49b1d010d30..3e4bffdc1cc0 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -169,6 +169,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -176,7 +177,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt, set) ? -ret: + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } @@ -198,8 +199,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -339,6 +340,7 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 42c893e08842..43d8c9896fa3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -143,7 +143,7 @@ static const char *get_physindev_name(const struct sk_buff *skb) return dev ? dev->name : NULL; } -static const char *get_phyoutdev_name(const struct sk_buff *skb) +static const char *get_physoutdev_name(const struct sk_buff *skb) { struct net_device *dev = nf_bridge_get_physoutdev(skb); @@ -178,15 +178,16 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const char *eiface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); + get_physoutdev_name(skb); if (!eiface) return -EINVAL; STRLCPY(e.iface, eiface); e.physdev = 1; #endif - } else + } else { STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + } if (strlen(e.iface) == 0) return -EINVAL; @@ -229,6 +230,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) @@ -249,8 +251,9 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr); + } if (retried) ip = ntohl(h->next.ip); @@ -261,8 +264,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -385,15 +388,16 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const char *eiface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); + get_physoutdev_name(skb); + if (!eiface) return -EINVAL; - STRLCPY(e.iface, eiface); e.physdev = 1; #endif - } else + } else { STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + } if (strlen(e.iface) == 0) return -EINVAL; @@ -403,7 +407,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; @@ -440,6 +444,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index b5428be1f159..3c862c0a76d1 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -57,8 +57,8 @@ struct hash_netnet4_elem { static inline bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, - const struct hash_netnet4_elem *ip2, - u32 *multi) + const struct hash_netnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp; @@ -84,7 +84,7 @@ hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) static inline void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, - struct hash_netnet4_elem *orig) + struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -103,7 +103,7 @@ hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) static bool hash_netnet4_data_list(struct sk_buff *skb, - const struct hash_netnet4_elem *data) + const struct hash_netnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -122,7 +122,7 @@ nla_put_failure: static inline void hash_netnet4_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet4_elem *d) + const struct hash_netnet4_elem *d) { next->ipcmp = d->ipcmp; } @@ -133,8 +133,8 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next, static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -156,7 +156,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -199,6 +199,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -221,8 +222,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { @@ -233,8 +235,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); @@ -251,8 +254,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = last2 + 1; } ip = last + 1; @@ -276,8 +279,8 @@ struct hash_netnet6_elem { static inline bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, - const struct hash_netnet6_elem *ip2, - u32 *multi) + const struct hash_netnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -304,7 +307,7 @@ hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) static inline void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, - struct hash_netnet6_elem *orig) + struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -323,7 +326,7 @@ hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) static bool hash_netnet6_data_list(struct sk_buff *skb, - const struct hash_netnet6_elem *data) + const struct hash_netnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -342,7 +345,7 @@ nla_put_failure: static inline void hash_netnet6_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet6_elem *d) + const struct hash_netnet6_elem *d) { } @@ -356,8 +359,8 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next, static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -367,7 +370,7 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) - e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); @@ -379,7 +382,7 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; @@ -424,6 +427,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 27307d0a8a5d..731813e0f08c 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -198,8 +198,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -208,6 +209,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -233,8 +235,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -250,8 +253,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } ip = last + 1; } @@ -413,14 +416,16 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -444,8 +449,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 1e0e47ae40a4..0c68734f5cc4 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -62,8 +62,8 @@ struct hash_netportnet4_elem { static inline bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, - const struct hash_netportnet4_elem *ip2, - u32 *multi) + const struct hash_netportnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp && @@ -91,7 +91,7 @@ hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) static inline void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, - struct hash_netportnet4_elem *orig) + struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -111,7 +111,7 @@ hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, static bool hash_netportnet4_data_list(struct sk_buff *skb, - const struct hash_netportnet4_elem *data) + const struct hash_netportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -132,7 +132,7 @@ nla_put_failure: static inline void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet4_elem *d) + const struct hash_netportnet4_elem *d) { next->ipcmp = d->ipcmp; next->port = d->port; @@ -144,8 +144,8 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next, static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -171,7 +171,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -223,14 +223,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -254,8 +256,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { @@ -273,8 +276,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); @@ -296,8 +300,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -324,8 +328,8 @@ struct hash_netportnet6_elem { static inline bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, - const struct hash_netportnet6_elem *ip2, - u32 *multi) + const struct hash_netportnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -354,7 +358,7 @@ hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) static inline void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, - struct hash_netportnet6_elem *orig) + struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -374,7 +378,7 @@ hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, static bool hash_netportnet6_data_list(struct sk_buff *skb, - const struct hash_netportnet6_elem *data) + const struct hash_netportnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -395,7 +399,7 @@ nla_put_failure: static inline void hash_netportnet6_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet6_elem *d) + const struct hash_netportnet6_elem *d) { next->port = d->port; } @@ -410,8 +414,8 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next, static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -437,7 +441,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -493,14 +497,16 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -524,8 +530,8 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 9f624ee9a41e..a1fe5377a2b3 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -206,14 +206,15 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, continue; } - if (d->before == 0) + if (d->before == 0) { ret = 1; - else if (d->before > 0) { + } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && next->id == d->refid; - } else + } else { ret = prev && prev->id == d->refid; + } return ret; } return 0; @@ -558,7 +559,7 @@ static const struct ip_set_type_variant set_variant = { static void list_set_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct list_set *map = set->data; spin_lock_bh(&set->lock); @@ -575,7 +576,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct list_set *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 04d15fdc99ee..1c8a42c1056c 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -1,9 +1,7 @@ #include #include -/* - * Prefixlen maps for fast conversions, by Jan Engelhardt. - */ +/* Prefixlen maps for fast conversions, by Jan Engelhardt. */ #define E(a, b, c, d) \ {.ip6 = { \ @@ -11,8 +9,7 @@ htonl(c), htonl(d), \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_netmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_netmask_map[] = { @@ -149,13 +146,12 @@ const union nf_inet_addr ip_set_netmask_map[] = { EXPORT_SYMBOL_GPL(ip_set_netmask_map); #undef E -#define E(a, b, c, d) \ - {.ip6 = { (__force __be32) a, (__force __be32) b, \ - (__force __be32) c, (__force __be32) d, \ +#define E(a, b, c, d) \ + {.ip6 = { (__force __be32)a, (__force __be32)b, \ + (__force __be32)c, (__force __be32)d, \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_hostmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_hostmask_map[] = { diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index b103e9627716..5669e5b453f4 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -9,7 +9,8 @@ */ /* Kernel module which implements the set match and SET target - * for netfilter/iptables. */ + * for netfilter/iptables. + */ #include #include @@ -53,6 +54,7 @@ static bool set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v0 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.u.compat.dim, info->match_set.u.compat.flags, 0, UINT_MAX); @@ -69,10 +71,10 @@ compat_flags(struct xt_set_info_v0 *info) info->u.compat.dim = IPSET_DIM_ZERO; if (info->u.flags[0] & IPSET_MATCH_INV) info->u.compat.flags |= IPSET_INV_MATCH; - for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { + for (i = 0; i < IPSET_DIM_MAX - 1 && info->u.flags[i]; i++) { info->u.compat.dim++; if (info->u.flags[i] & IPSET_SRC) - info->u.compat.flags |= (1<u.compat.dim); + info->u.compat.flags |= (1 << info->u.compat.dim); } } @@ -89,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) info->match_set.index); return -ENOENT; } - if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; @@ -115,6 +117,7 @@ static bool set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v1 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, 0, UINT_MAX); @@ -179,9 +182,10 @@ static bool set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v3 *info = par->matchinfo; + int ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -225,9 +229,10 @@ static bool set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v4 *info = par->matchinfo; + int ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -253,6 +258,7 @@ static unsigned int set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v0 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, info->add_set.u.compat.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, @@ -291,8 +297,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) return -ENOENT; } } - if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || - info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 || + info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); @@ -325,6 +331,7 @@ static unsigned int set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v1 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -393,6 +400,7 @@ static unsigned int set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v2 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -400,8 +408,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -419,6 +427,8 @@ static unsigned int set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; + int ret; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -426,12 +436,10 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) ADT_OPT(map_opt, par->family, info->map_set.dim, info->map_set.flags, 0, UINT_MAX); - int ret; - /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -457,7 +465,6 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } - static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { @@ -497,8 +504,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) !(par->hook_mask & (1 << NF_INET_FORWARD | 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { - pr_warn("mapping of prio or/and queue is allowed only" - "from OUTPUT/FORWARD/POSTROUTING chains\n"); + pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); return -EINVAL; } index = ip_set_nfnl_get_byindex(par->net, @@ -519,8 +525,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { - pr_warn("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -546,7 +551,6 @@ set_target_v3_destroy(const struct xt_tgdtor_param *par) ip_set_nfnl_put(par->net, info->map_set.index); } - static struct xt_match set_matches[] __read_mostly = { { .name = "set", -- cgit From 2d45a02d0166caf2627fe91897c6ffc3b19514c4 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 12 Jun 2015 10:16:41 -0300 Subject: sctp: fix ASCONF list handling ->auto_asconf_splist is per namespace and mangled by functions like sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization. Also, the call to inet_sk_copy_descendant() was backuping ->auto_asconf_list through the copy but was not honoring ->do_auto_asconf, which could lead to list corruption if it was different between both sockets. This commit thus fixes the list handling by using ->addr_wq_lock spinlock to protect the list. A special handling is done upon socket creation and destruction for that. Error handlig on sctp_init_sock() will never return an error after having initialized asconf, so sctp_destroy_sock() can be called without addrq_wq_lock. The lock now will be take on sctp_close_sock(), before locking the socket, so we don't do it in inverse order compared to sctp_addr_wq_timeout_handler(). Instead of taking the lock on sctp_sock_migrate() for copying and restoring the list values, it's preferred to avoid rewritting it by implementing sctp_copy_descendant(). Issue was found with a test application that kept flipping sysctl default_auto_asconf on and off, but one could trigger it by issuing simultaneous setsockopt() calls on multiple sockets or by creating/destroying sockets fast enough. This is only triggerable locally. Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).") Reported-by: Ji Jianwen Suggested-by: Neil Horman Suggested-by: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f09de7fac2e6..5f6c4e61325b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1528,8 +1528,10 @@ static void sctp_close(struct sock *sk, long timeout) /* Supposedly, no process has access to the socket, but * the net layers still may. + * Also, sctp_destroy_sock() needs to be called with addr_wq_lock + * held and that should be grabbed before socket lock. */ - local_bh_disable(); + spin_lock_bh(&net->sctp.addr_wq_lock); bh_lock_sock(sk); /* Hold the sock, since sk_common_release() will put sock_put() @@ -1539,7 +1541,7 @@ static void sctp_close(struct sock *sk, long timeout) sk_common_release(sk); bh_unlock_sock(sk); - local_bh_enable(); + spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); @@ -3580,6 +3582,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf)) return 0; + spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); if (val == 0 && sp->do_auto_asconf) { list_del(&sp->auto_asconf_list); sp->do_auto_asconf = 0; @@ -3588,6 +3591,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, &sock_net(sk)->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; } + spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock); return 0; } @@ -4121,18 +4125,28 @@ static int sctp_init_sock(struct sock *sk) local_bh_disable(); percpu_counter_inc(&sctp_sockets_allocated); sock_prot_inuse_add(net, sk->sk_prot, 1); + + /* Nothing can fail after this block, otherwise + * sctp_destroy_sock() will be called without addr_wq_lock held + */ if (net->sctp.default_auto_asconf) { + spin_lock(&sock_net(sk)->sctp.addr_wq_lock); list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; - } else + spin_unlock(&sock_net(sk)->sctp.addr_wq_lock); + } else { sp->do_auto_asconf = 0; + } + local_bh_enable(); return 0; } -/* Cleanup any SCTP per socket resources. */ +/* Cleanup any SCTP per socket resources. Must be called with + * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true + */ static void sctp_destroy_sock(struct sock *sk) { struct sctp_sock *sp; @@ -7195,6 +7209,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->mc_list = NULL; } +static inline void sctp_copy_descendant(struct sock *sk_to, + const struct sock *sk_from) +{ + int ancestor_size = sizeof(struct inet_sock) + + sizeof(struct sctp_sock) - + offsetof(struct sctp_sock, auto_asconf_list); + + if (sk_from->sk_family == PF_INET6) + ancestor_size += sizeof(struct ipv6_pinfo); + + __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); +} + /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. */ @@ -7209,7 +7236,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sk_buff *skb, *tmp; struct sctp_ulpevent *event; struct sctp_bind_hashbucket *head; - struct list_head tmplist; /* Migrate socket buffer sizes and all the socket level options to the * new socket. @@ -7217,12 +7243,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, newsk->sk_sndbuf = oldsk->sk_sndbuf; newsk->sk_rcvbuf = oldsk->sk_rcvbuf; /* Brute force copy old sctp opt. */ - if (oldsp->do_auto_asconf) { - memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist)); - inet_sk_copy_descendant(newsk, oldsk); - memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist)); - } else - inet_sk_copy_descendant(newsk, oldsk); + sctp_copy_descendant(newsk, oldsk); /* Restore the ep value that was overwritten with the above structure * copy. -- cgit From 758f0d4b16e0508aa47a4a4d06eacba0d66e24d5 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 12 Jun 2015 17:24:03 +0200 Subject: tcp: cdg: use div_u64() Fixes cross-compile to mips. Signed-off-by: Kenneth Klette Jonassen Signed-off-by: David S. Miller --- net/ipv4/tcp_cdg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index a52ce2db53f2..8c6fd3d5e40f 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -145,7 +145,7 @@ static void tcp_cdg_hystart_update(struct sock *sk) return; if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now_us = local_clock() / NSEC_PER_USEC; + u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { ca->last_ack = now_us; -- cgit From 1bc1754e82f03c2c29b6e39ee02af48fa3bdef23 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Fri, 12 Jun 2015 12:44:24 +0530 Subject: mac802154: rx packet handle cleanup This patch replaces !netif_running(sdata->dev) with !ieee802154_sdata_running(sdata) and also devide the code two separate if branches. Signed-off-by: Varka Bhadram Signed-off-by: Marcel Holtmann --- net/mac802154/rx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 1bdf98068608..d93ad2d4a4fc 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -202,8 +202,10 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, } list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE || - !netif_running(sdata->dev)) + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE) + continue; + + if (!ieee802154_sdata_running(sdata)) continue; ieee802154_subif_frame(sdata, skb, &hdr); -- cgit From b4ee194441d7e4457c7bac6c2a5da8428974db5a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 13 Jun 2015 22:15:51 +0200 Subject: mac802154: iface: fix hrtimer cancel on ifdown The interframe spacing timer is a per phy definition and is part of a ieee802154_local structure. If we have possible multiple interfaces ifdown one interface then the timer should not be cancled. First if the last interface is down and the receive handling is stopped we should be sure that the interframe spacing timer isn't run anymore. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index e3d77b07c0e4..d8043378e7a0 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -302,15 +302,15 @@ static int mac802154_slave_close(struct net_device *dev) ASSERT_RTNL(); - hrtimer_cancel(&local->ifs_timer); - netif_stop_queue(dev); local->open_count--; clear_bit(SDATA_STATE_RUNNING, &sdata->state); - if (!local->open_count) + if (!local->open_count) { + hrtimer_cancel(&local->ifs_timer); drv_stop(local); + } return 0; } -- cgit From ed2e627cb17d385f02d0a28fd7e564031f7769b0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 13 Jun 2015 22:15:52 +0200 Subject: mac802154: iface: flush workqueue before stop This patch flushs the workqueue which is currently used for xmit_sync callback before calling stop driver-ops. Flush the queue will ensure all pending tx frames are transmitted. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index d8043378e7a0..692731d240e4 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -308,6 +308,7 @@ static int mac802154_slave_close(struct net_device *dev) clear_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { + flush_workqueue(local->workqueue); hrtimer_cancel(&local->ifs_timer); drv_stop(local); } -- cgit From f09becc79f899f92557ce6d5562a8b80d6addb34 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Jun 2015 13:58:52 +0200 Subject: netfilter: Kconfig: get rid of parens around depends on According to the reporter, they are not needed. Reported-by: Sergei Shtylyov Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 3 ++- net/ipv6/netfilter/Kconfig | 3 ++- net/netfilter/Kconfig | 18 +++++++++--------- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index fb20f363151f..2199a5db25e6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -195,7 +195,8 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE || IP_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ca6998345b42..b552cf0d6198 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -186,7 +186,8 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP6_NF_MANGLE || IP6_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index fbc8d15c7fda..6eae69a698ed 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -206,7 +206,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -723,7 +723,7 @@ config NETFILTER_XT_TARGET_HL config NETFILTER_XT_TARGET_HMARK tristate '"HMARK" target support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED ---help--- This option adds the "HMARK" target. @@ -865,7 +865,7 @@ config NETFILTER_XT_TARGET_REDIRECT config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK ---help--- This option adds a "TEE" target with which a packet can be cloned and @@ -875,8 +875,8 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES @@ -915,7 +915,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -1127,7 +1127,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. @@ -1369,8 +1369,8 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help -- cgit From fb77ff4f43990dc91926ce2704036a547482544e Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Mon, 15 Jun 2015 17:34:23 +0200 Subject: NFC: nci: fix mistake in uart generic driver It was not possible to register a UART driver due to a bad condition. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- net/nfc/nci/uart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 2c48810af5bb..21d8875673a4 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -417,7 +417,7 @@ int nci_uart_register(struct nci_uart *nu) nu->ops.recv = nci_uart_default_recv; /* Add this driver in the driver list */ - if (!nci_uart_drivers[nu->driver]) { + if (nci_uart_drivers[nu->driver]) { pr_err("driver %d is already registered\n", nu->driver); return -EBUSY; } -- cgit From 711bdde6a884354ddae8da2fcb495b2a9364cc90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Jun 2015 09:57:30 -0700 Subject: netfilter: x_tables: remove XT_TABLE_INFO_SZ and a dereference. After Florian patches, there is no need for XT_TABLE_INFO_SZ anymore : Only one copy of table is kept, instead of one copy per cpu. We also can avoid a dereference if we put table data right after xt_table_info. It reduces register pressure and helps compiler. Then, we attempt a kmalloc() if total size is under order-3 allocation, to reduce TLB pressure, as in many cases, rules fit in 32 KB. Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 4 ++-- net/ipv4/netfilter/ip_tables.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 4 ++-- net/netfilter/x_tables.c | 32 ++++++++++++-------------------- 4 files changed, 18 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d75c139393fc..95c9b6eece25 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -256,7 +256,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, const struct arphdr *arp; struct arpt_entry *e, *back; const char *indev, *outdev; - void *table_base; + const void *table_base; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; @@ -868,7 +868,7 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct arpt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6151500c2df8..6c72fbb7b49e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -938,7 +938,7 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1052,7 +1052,7 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ipt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 80a7f0d38aa4..3c35ced39b42 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -951,7 +951,7 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1065,7 +1065,7 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 6062ce3e862c..d324fe71260c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -658,29 +658,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user); struct xt_table_info *xt_alloc_table_info(unsigned int size) { - struct xt_table_info *newinfo; + struct xt_table_info *info = NULL; + size_t sz = sizeof(*info) + size; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); - if (!newinfo) - return NULL; - - newinfo->size = size; - - if (size <= PAGE_SIZE) - newinfo->entries = kmalloc(size, GFP_KERNEL); - else - newinfo->entries = vmalloc(size); - - if (newinfo->entries == NULL) { - xt_free_table_info(newinfo); - return NULL; + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!info) { + info = vmalloc(sz); + if (!info) + return NULL; } - - return newinfo; + memset(info, 0, sizeof(*info)); + info->size = size; + return info; } EXPORT_SYMBOL(xt_alloc_table_info); @@ -688,8 +682,6 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - kvfree(info->entries); - if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) kvfree(info->jumpstack[cpu]); @@ -698,7 +690,7 @@ void xt_free_table_info(struct xt_table_info *info) free_percpu(info->stackptr); - kfree(info); + kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); -- cgit From 2cbce139fc57bc2625f88add055d0b94f00c3352 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Jun 2015 13:55:41 +0200 Subject: netfilter: nf_tables: attach net_device to basechain The device is part of the hook configuration, so instead of a global configuration per table, set it to each of the basechain that we create. This patch reworks ebddf1a8d78a ("netfilter: nf_tables: allow to bind table to net_device"). Note that this adds a dev_name field in the nft_base_chain structure which is required the netdev notification subscription that follows up in a patch to handle gone net_devices. Suggested-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 79 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4528f122bcd2..900c81a2f89a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -399,8 +399,6 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, - [NFTA_TABLE_DEV] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -425,10 +423,6 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) goto nla_put_failure; - if (table->dev && - nla_put_string(skb, NFTA_TABLE_DEV, table->dev->name)) - goto nla_put_failure; - nlmsg_end(skb, nlh); return 0; @@ -614,11 +608,6 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if (flags == ctx->table->flags) return 0; - if ((ctx->afi->flags & NFT_AF_NEEDS_DEV) && - ctx->nla[NFTA_TABLE_DEV] && - nla_strcmp(ctx->nla[NFTA_TABLE_DEV], ctx->table->dev->name)) - return -EOPNOTSUPP; - trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) @@ -656,7 +645,6 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, struct nft_table *table; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - struct net_device *dev = NULL; u32 flags = 0; struct nft_ctx ctx; int err; @@ -691,20 +679,6 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } - if (afi->flags & NFT_AF_NEEDS_DEV) { - char ifname[IFNAMSIZ]; - - if (!nla[NFTA_TABLE_DEV]) - return -EOPNOTSUPP; - - nla_strlcpy(ifname, nla[NFTA_TABLE_DEV], IFNAMSIZ); - dev = dev_get_by_name(net, ifname); - if (!dev) - return -ENOENT; - } else if (nla[NFTA_TABLE_DEV]) { - return -EOPNOTSUPP; - } - err = -EAFNOSUPPORT; if (!try_module_get(afi->owner)) goto err1; @@ -718,7 +692,6 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); table->flags = flags; - table->dev = dev; nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); @@ -732,9 +705,6 @@ err3: err2: module_put(afi->owner); err1: - if (dev != NULL) - dev_put(dev); - return err; } @@ -838,9 +808,6 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) { BUG_ON(ctx->table->use > 0); - if (ctx->table->dev) - dev_put(ctx->table->dev); - kfree(ctx->table); module_put(ctx->afi->owner); } @@ -916,6 +883,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFTA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, }; static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) @@ -989,6 +958,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) goto nla_put_failure; + if (basechain->dev_name[0] && + nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) + goto nla_put_failure; nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -1200,9 +1172,13 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) BUG_ON(chain->use > 0); if (chain->flags & NFT_BASE_CHAIN) { - module_put(nft_base_chain(chain)->type->owner); - free_percpu(nft_base_chain(chain)->stats); - kfree(nft_base_chain(chain)); + struct nft_base_chain *basechain = nft_base_chain(chain); + + module_put(basechain->type->owner); + free_percpu(basechain->stats); + if (basechain->ops[0].dev != NULL) + dev_put(basechain->ops[0].dev); + kfree(basechain); } else { kfree(chain); } @@ -1221,6 +1197,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, struct nlattr *ha[NFTA_HOOK_MAX + 1]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct net_device *dev = NULL; u8 policy = NF_ACCEPT; u64 handle = 0; unsigned int i; @@ -1360,17 +1337,43 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return -ENOENT; hookfn = type->hooks[hooknum]; + if (afi->flags & NFT_AF_NEEDS_DEV) { + char ifname[IFNAMSIZ]; + + if (!ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + + nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); + dev = dev_get_by_name(net, ifname); + if (!dev) { + module_put(type->owner); + return -ENOENT; + } + } else if (ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); if (basechain == NULL) { module_put(type->owner); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } + if (dev != NULL) + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return PTR_ERR(stats); } basechain->stats = stats; @@ -1379,6 +1382,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (stats == NULL) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } rcu_assign_pointer(basechain->stats, stats); @@ -1396,7 +1401,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, ops->priority = priority; ops->priv = chain; ops->hook = afi->hooks[ops->hooknum]; - ops->dev = table->dev; + ops->dev = dev; if (hookfn) ops->hook = hookfn; if (afi->hook_ops_init) -- cgit From d8ee8f7c56b267751f95a1f0b4a75618909c07ac Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Jun 2015 02:42:31 +0200 Subject: netfilter: nf_tables: add nft_register_basechain() and nft_unregister_basechain() This wrapper functions take care of hook registration for basechains. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 52 ++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 900c81a2f89a..c66dc62afb46 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -127,13 +127,38 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +static int nft_register_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) +{ + return nf_register_hooks(basechain->ops, hook_nops); +} + +static void nft_unregister_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) +{ + nf_unregister_hooks(basechain->ops, hook_nops); +} + +static int nf_tables_register_hooks(const struct nft_table *table, + struct nft_chain *chain, + unsigned int hook_nops) +{ + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return 0; + + return nft_register_basechain(nft_base_chain(chain), hook_nops); +} + static void nf_tables_unregister_hooks(const struct nft_table *table, - const struct nft_chain *chain, + struct nft_chain *chain, unsigned int hook_nops) { - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops); + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return; + + nft_unregister_basechain(nft_base_chain(chain), hook_nops); } /* Internal table flags */ @@ -560,7 +585,7 @@ static int nf_tables_table_enable(const struct nft_af_info *afi, if (!(chain->flags & NFT_BASE_CHAIN)) continue; - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + err = nft_register_basechain(nft_base_chain(chain), afi->nops); if (err < 0) goto err; @@ -575,20 +600,20 @@ err: if (i-- <= 0) break; - nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); + nft_unregister_basechain(nft_base_chain(chain), afi->nops); } return err; } static void nf_tables_table_disable(const struct nft_af_info *afi, - struct nft_table *table) + struct nft_table *table) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { if (chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, - afi->nops); + nft_unregister_basechain(nft_base_chain(chain), + afi->nops); } } @@ -1421,12 +1446,9 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, chain->table = table; nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) { - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); - if (err < 0) - goto err1; - } + err = nf_tables_register_hooks(table, chain, afi->nops); + if (err < 0) + goto err1; nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); -- cgit From 835b803377f5f11f9ccf234f70ed667a82605c45 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Jun 2015 12:12:01 +0200 Subject: netfilter: nf_tables_netdev: unregister hooks on net_device removal In case the net_device is gone, we have to unregister the hooks and put back the reference on the net_device object. Once it comes back, register them again. This also covers the device rename case. This patch also adds a new flag to indicate that the basechain is disabled, so their hooks are not registered. This flag is used by the netdev family to handle the case where the net_device object is gone. Currently this flag is not exposed to userspace. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++--- net/netfilter/nf_tables_netdev.c | 75 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c66dc62afb46..cfe636808541 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -127,17 +127,25 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -static int nft_register_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) +int nft_register_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) { + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return 0; + return nf_register_hooks(basechain->ops, hook_nops); } +EXPORT_SYMBOL_GPL(nft_register_basechain); -static void nft_unregister_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) +void nft_unregister_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) { + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return; + nf_unregister_hooks(basechain->ops, hook_nops); } +EXPORT_SYMBOL_GPL(nft_unregister_basechain); static int nf_tables_register_hooks(const struct nft_table *table, struct nft_chain *chain, diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 04cb17057f46..2cae4d4a03b7 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -157,6 +158,77 @@ static const struct nf_chain_type nft_filter_chain_netdev = { .hook_mask = (1 << NF_NETDEV_INGRESS), }; +static void nft_netdev_event(unsigned long event, struct nft_af_info *afi, + struct net_device *dev, struct nft_table *table, + struct nft_base_chain *basechain) +{ + switch (event) { + case NETDEV_REGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED)); + + dev_hold(dev); + basechain->ops[0].dev = dev; + basechain->flags &= ~NFT_BASECHAIN_DISABLED; + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nft_register_basechain(basechain, afi->nops); + break; + case NETDEV_UNREGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED); + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nft_unregister_basechain(basechain, afi->nops); + + dev_put(basechain->ops[0].dev); + basechain->ops[0].dev = NULL; + basechain->flags |= NFT_BASECHAIN_DISABLED; + break; + case NETDEV_CHANGENAME: + if (dev->ifindex != basechain->ops[0].dev->ifindex) + return; + + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + break; + } +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { + if (afi->family != NFPROTO_NETDEV) + continue; + + list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + nft_netdev_event(event, afi, dev, table, + nft_base_chain(chain)); + } + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + +static struct notifier_block nf_tables_netdev_notifier = { + .notifier_call = nf_tables_netdev_event, +}; + static int __init nf_tables_netdev_init(void) { int ret; @@ -166,11 +238,14 @@ static int __init nf_tables_netdev_init(void) if (ret < 0) nft_unregister_chain_type(&nft_filter_chain_netdev); + register_netdevice_notifier(&nf_tables_netdev_notifier); + return ret; } static void __exit nf_tables_netdev_exit(void) { + unregister_netdevice_notifier(&nf_tables_netdev_notifier); unregister_pernet_subsys(&nf_tables_netdev_net_ops); nft_unregister_chain_type(&nft_filter_chain_netdev); } -- cgit From ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:12 -0700 Subject: bpf: introduce current->pid, tgid, uid, gid, comm accessors eBPF programs attached to kprobes need to filter based on current->pid, uid and other fields, so introduce helper functions: u64 bpf_get_current_pid_tgid(void) Return: current->tgid << 32 | current->pid u64 bpf_get_current_uid_gid(void) Return: current_gid << 32 | current_uid bpf_get_current_comm(char *buf, int size_of_buf) stores current->comm into buf They can be used from the programs attached to TC as well to classify packets based on current task fields. Update tracex2 example to print histogram of write syscalls for each process instead of aggregated for all. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d271c06bf01f..20aa51ccbf9d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1459,6 +1459,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; default: return sk_filter_func_proto(func_id); } -- cgit From 0756ea3e85139d23a8148ebaa95411c2f0aa4f11 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:13 -0700 Subject: bpf: allow networking programs to use bpf_trace_printk() for debugging bpf_trace_printk() is a helper function used to debug eBPF programs. Let socket and TC programs use it as well. Note, it's DEBUG ONLY helper. If it's used in the program, the kernel will print warning banner to make sure users don't use it in production. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 20aa51ccbf9d..65ff107d3d29 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1442,6 +1442,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); default: return NULL; } -- cgit From 7f1095394918c7058ff81c96c3bab3a897e97a9d Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Fri, 12 Jun 2015 17:39:50 -0700 Subject: bridge: use either ndo VLAN ops or switchdev VLAN ops to install MASTER vlans v2: Move struct switchdev_obj automatics to inner scope where there used. v1: To maintain backward compatibility with the existing iproute2 "bridge vlan" command, let bridge's setlink/dellink handler call into either the port driver's 8021q ndo ops or the port driver's bridge_setlink/dellink ops. This allows port driver to choose 8021q ops or the newer bridge_setlink/dellink ops when implementing VLAN add/del filtering on the device. The iproute "bridge vlan" command does not need to be modified. To summarize using the "bridge vlan" command examples, we have: 1) bridge vlan add|del vid VID dev DEV Here iproute2 sets MASTER flag. Bridge's bridge_setlink/dellink is called. Vlan is set on bridge for port. If port driver implements ndo 8021q ops, call those to port driver can install vlan filter on device. Otherwise, if port driver implements bridge_setlink/dellink ops, call those to install vlan filter to device. This option only works if port is bridged. 2) bridge vlan add|del vid VID dev DEV master Same as 1) 3) bridge vlan add|del vid VID dev DEV self Bridge's bridge_setlink/dellink isn't called. Port driver's bridge_setlink/dellink is called, if implemented. This option works if port is bridged or not. If port is not bridged, a VLAN can still be added/deleted to device filter using this variant. 4) bridge vlan add|del vid VID dev DEV master self This is a combination of 1) and 3), but will only work if port is bridged. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 13013fe8db24..17fc358a5432 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "br_private.h" @@ -36,6 +37,36 @@ static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags) clear_bit(vid, v->untagged_bitmap); } +static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, + u16 vid, u16 flags) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + /* If driver uses VLAN ndo ops, use 8021q to install vid + * on device, otherwise try switchdev ops to install vid. + */ + + if (ops->ndo_vlan_rx_add_vid) { + err = vlan_vid_add(dev, br->vlan_proto, vid); + } else { + struct switchdev_obj vlan_obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + .u.vlan = { + .flags = flags, + .vid_start = vid, + .vid_end = vid, + }, + }; + + err = switchdev_port_obj_add(dev, &vlan_obj); + if (err == -EOPNOTSUPP) + err = 0; + } + + return err; +} + static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) { struct net_bridge_port *p = NULL; @@ -62,7 +93,7 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ - err = vlan_vid_add(dev, br->vlan_proto, vid); + err = __vlan_vid_add(dev, br, vid, flags); if (err) return err; } @@ -86,6 +117,30 @@ out_filt: return err; } +static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, + u16 vid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* If driver uses VLAN ndo ops, use 8021q to delete vid + * on device, otherwise try switchdev ops to delete vid. + */ + + if (ops->ndo_vlan_rx_kill_vid) { + vlan_vid_del(dev, br->vlan_proto, vid); + } else { + struct switchdev_obj vlan_obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + .u.vlan = { + .vid_start = vid, + .vid_end = vid, + }, + }; + + switchdev_port_obj_del(dev, &vlan_obj); + } +} + static int __vlan_del(struct net_port_vlans *v, u16 vid) { if (!test_bit(vid, v->vlan_bitmap)) @@ -96,7 +151,7 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) if (v->port_idx) { struct net_bridge_port *p = v->parent.port; - vlan_vid_del(p->dev, p->br->vlan_proto, vid); + __vlan_vid_del(p->dev, p->br, vid); } clear_bit(vid, v->vlan_bitmap); -- cgit From b4ad7baa01970d2c5096fbbcb0c593e199c6f18b Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 14 Jun 2015 11:33:11 -0700 Subject: bridge: del external_learned fdbs from device on flush or ageout We need to delete from offload the device externally learnded fdbs when any one of these events happen: 1) Bridge ages out fdb. (When bridge is doing ageing vs. device doing ageing. If device is doing ageing, it would send SWITCHDEV_FDB_DEL directly). 2) STP state change flushes fdbs on port. 3) User uses sysfs interface to flush fdbs from bridge or bridge port: echo 1 >/sys/class/net/BR_DEV/bridge/flush echo 1 >/sys/class/net/BR_PORT/brport/flush 4) Offload driver send event SWITCHDEV_FDB_DEL to delete fdb entry. For rocker, we can now get called to delete fdb entry in wait and nowait contexts, so set NOWAIT flag when deleting fdb entry. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 13949a71591d..be84b7e5a3da 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; @@ -130,11 +131,27 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) } } +static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) +{ + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .u.fdb = { + .addr = f->addr.addr, + .vid = f->vlan_id, + }, + }; + + switchdev_port_obj_del(f->dst->dev, &obj); +} + static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) { if (f->is_static) fdb_del_hw_addr(br, f->addr.addr); + if (f->added_by_external_learn) + fdb_del_external_learn(f); + hlist_del_rcu(&f->hlist); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); -- cgit From 3b766cd832328fcb87db3507e7b98cf42f21689d Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:59:07 +0300 Subject: net/core: Add reading VF statistics through the PF netdevice Add ndo_get_vf_stats where the PF retrieves and fills the VFs traffic statistics. We encode the VF stats in a nested manner to allow for future extensions. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 077b6d280371..2d102ce1474f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -819,7 +819,19 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + - nla_total_size(sizeof(struct ifla_vf_rss_query_en))); + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size(sizeof(__u64))); return size; } else return 0; @@ -1123,7 +1135,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, && (ext_filter_mask & RTEXT_FILTER_VF)) { int i; - struct nlattr *vfinfo, *vf; + struct nlattr *vfinfo, *vf, *vfstats; int num_vfs = dev_num_vf(dev->dev.parent); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); @@ -1138,6 +1150,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_rss_query_en vf_rss_query_en; + struct ifla_vf_stats vf_stats; /* * Not all SR-IOV capable drivers support the @@ -1190,6 +1203,30 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, sizeof(vf_rss_query_en), &vf_rss_query_en)) goto nla_put_failure; + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, i, + &vf_stats); + vfstats = nla_nest_start(skb, IFLA_VF_STATS); + if (!vfstats) { + nla_nest_cancel(skb, vf); + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast) || + nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast)) + goto nla_put_failure; + nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); } nla_nest_end(skb, vfinfo); @@ -1303,6 +1340,16 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, + [IFLA_VF_STATS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { + [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 }, + [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { -- cgit From eb4cb008529ca08e0d8c0fa54e8f739520197a65 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:18 -0400 Subject: sock_diag: define destruction multicast groups These groups will contain socket-destruction events for AF_INET/AF_INET6, IPPROTO_TCP/IPPROTO_UDP. Near the end of socket destruction, a check for listeners is performed. In the presence of a listener, rather than completely cleanup the socket, a unit of work will be added to a private work queue which will first broadcast information about the socket and then finish the cleanup operation. Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 11 ++++++- net/core/sock_diag.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 7063c329c1b6..1e1fe9a68d83 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -131,6 +131,7 @@ #include #include #include +#include #include @@ -1423,7 +1424,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -static void __sk_free(struct sock *sk) +void sk_destruct(struct sock *sk) { struct sk_filter *filter; @@ -1451,6 +1452,14 @@ static void __sk_free(struct sock *sk) sk_prot_free(sk->sk_prot_creator, sk); } +static void __sk_free(struct sock *sk) +{ + if (unlikely(sock_diag_has_destroy_listeners(sk))) + sock_diag_broadcast_destroy(sk); + else + sk_destruct(sk); +} + void sk_free(struct sock *sk) { /* diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 74dddf84adcd..d79866c5f8bc 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include #include #include @@ -12,6 +15,7 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); +static struct workqueue_struct *broadcast_wq; static u64 sock_gen_cookie(struct sock *sk) { @@ -101,6 +105,62 @@ out: } EXPORT_SYMBOL(sock_diag_put_filterinfo); +struct broadcast_sk { + struct sock *sk; + struct work_struct work; +}; + +static size_t sock_diag_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ + + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ +} + +static void sock_diag_broadcast_destroy_work(struct work_struct *work) +{ + struct broadcast_sk *bsk = + container_of(work, struct broadcast_sk, work); + struct sock *sk = bsk->sk; + const struct sock_diag_handler *hndl; + struct sk_buff *skb; + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + int err = -1; + + WARN_ON(group == SKNLGRP_NONE); + + skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out; + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[sk->sk_family]; + if (hndl && hndl->get_info) + err = hndl->get_info(skb, sk); + mutex_unlock(&sock_diag_table_mutex); + + if (!err) + nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, + GFP_KERNEL); + else + kfree_skb(skb); +out: + sk_destruct(sk); + kfree(bsk); +} + +void sock_diag_broadcast_destroy(struct sock *sk) +{ + /* Note, this function is often called from an interrupt context. */ + struct broadcast_sk *bsk = + kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); + if (!bsk) + return sk_destruct(sk); + bsk->sk = sk; + INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work); + queue_work(broadcast_wq, &bsk->work); +} + void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); @@ -211,10 +271,32 @@ static void sock_diag_rcv(struct sk_buff *skb) mutex_unlock(&sock_diag_mutex); } +static int sock_diag_bind(struct net *net, int group) +{ + switch (group) { + case SKNLGRP_INET_TCP_DESTROY: + case SKNLGRP_INET_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + break; + case SKNLGRP_INET6_TCP_DESTROY: + case SKNLGRP_INET6_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET6]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + break; + } + return 0; +} + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { + .groups = SKNLGRP_MAX, .input = sock_diag_rcv, + .bind = sock_diag_bind, + .flags = NL_CFG_F_NONROOT_RECV, }; net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); @@ -234,12 +316,15 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { + broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } static void __exit sock_diag_exit(void) { unregister_pernet_subsys(&diag_net_ops); + destroy_workqueue(broadcast_wq); } module_init(sock_diag_init); -- cgit From 3fd22af808f4d7455ba91596d334438c7ee0f889 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:19 -0400 Subject: sock_diag: specify info_size per inet protocol Previously, there was no clear distinction between the inet protocols that used struct tcp_info to report information and those that didn't. This change adds a specific size attribute to the inet_diag_handler struct which defines these interfaces. This will make dispatching sock_diag get_info requests identical for all inet protocols in a following patch. Tested: ss -au Tested: ss -at Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/diag.c | 1 + net/ipv4/inet_diag.c | 4 ++-- net/ipv4/tcp_diag.c | 1 + net/ipv4/udp_diag.c | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 5a45f8de5d99..2d84303ea6bf 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -66,6 +66,7 @@ static const struct inet_diag_handler dccp_diag_handler = { .dump_one = dccp_diag_dump_one, .idiag_get_info = dccp_diag_get_info, .idiag_type = IPPROTO_DCCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init dccp_diag_init(void) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4d32262c7502..b1f01174bf32 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -200,9 +200,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #undef EXPIRES_IN_MS - if (ext & (1 << (INET_DIAG_INFO - 1))) { + if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve(skb, INET_DIAG_INFO, - sizeof(struct tcp_info)); + handler->idiag_info_size); if (!attr) goto errout; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 79b34a0f4a4a..423e3881a40b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -50,6 +50,7 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init tcp_diag_init(void) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index b763c39ae1d7..6116604bf6e8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -170,6 +170,7 @@ static const struct inet_diag_handler udp_diag_handler = { .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, + .idiag_info_size = 0, }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -190,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = { .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, + .idiag_info_size = 0, }; static int __init udp_diag_init(void) -- cgit From 35ac838a9b96470f999db04320f53a2033642bfb Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:20 -0400 Subject: sock_diag: implement a get_info handler for inet This get_info handler will simply dispatch to the appropriate existing inet protocol handler. This patch also includes a new netlink attribute (INET_DIAG_PROTOCOL). This attribute is currently only used for multicast messages. Without this attribute, there is no way of knowing the IP protocol used by the socket information being broadcast. This attribute is not necessary in the 'dump' variant of this protocol (though it could easily be added) because dump requests are issued for specific family/protocol pairs. Tested: ss -E (note, the -E option has not yet been merged into the upstream version of ss). Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_diag.c | 5 +++-- 3 files changed, 52 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index b1f01174bf32..21985d8d41e7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1078,14 +1078,60 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) return inet_diag_get_exact(skb, h, nlmsg_data(h)); } +static +int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) +{ + const struct inet_diag_handler *handler; + struct nlmsghdr *nlh; + struct nlattr *attr; + struct inet_diag_msg *r; + void *info = NULL; + int err = 0; + + nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); + if (!nlh) + return -ENOMEM; + + r = nlmsg_data(nlh); + memset(r, 0, sizeof(*r)); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = sk->sk_state; + + if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { + nlmsg_cancel(skb, nlh); + return err; + } + + handler = inet_diag_lock_handler(sk->sk_protocol); + if (IS_ERR(handler)) { + inet_diag_unlock_handler(handler); + nlmsg_cancel(skb, nlh); + return PTR_ERR(handler); + } + + attr = handler->idiag_info_size + ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size) + : NULL; + if (attr) + info = nla_data(attr); + + handler->idiag_get_info(sk, r, info); + inet_diag_unlock_handler(handler); + + nlmsg_end(skb, nlh); + return 0; +} + static const struct sock_diag_handler inet_diag_handler = { .family = AF_INET, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; static const struct sock_diag_handler inet6_diag_handler = { .family = AF_INET6, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; int inet_diag_register(const struct inet_diag_handler *h) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 65f791f74845..697b86dd45b3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2624,13 +2624,15 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; u32 rate; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 423e3881a40b..479f34946177 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -19,13 +19,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { - const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; if (sk->sk_state == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; - } else { + } else if (sk->sk_type == SOCK_STREAM) { + const struct tcp_sock *tp = tcp_sk(sk); + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } -- cgit From 89d256bb69f2596c3a31ac51466eac9e1791c388 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 15 Jun 2015 20:49:24 -0700 Subject: bpf: disallow bpf tc programs access current->pid,uid Accessing current->pid/uid from cls_bpf may lead to misleading results and should not be used when TC classifiers need accurate information about pid/uid. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 65ff107d3d29..be3098fb65e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1461,12 +1461,6 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; default: return sk_filter_func_proto(func_id); } -- cgit From 95c0aa15711e1e5fd62300a8abd244186ebf67e7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 17 Jun 2015 15:35:17 +0200 Subject: mac802154: iface: fix order while interface up This patch moves the hardware setting before calling the driver start callback which activates the receive handling. The hardware setup contains settings like address filtering which should be setup before activate the receive handling on the transceiver. These setting are protected by ieee802154_check_concurrent_iface check. This means we need to set these registers once before calling drv_start and can't be overwritten by other interfaces. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 103 +++++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 692731d240e4..0b0cccb85336 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -135,6 +135,56 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) return mac802154_wpan_update_llsec(dev); } +static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) +{ + struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + int ret; + + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, + wpan_dev->promiscuous_mode); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, wpan_dev->pan_id); + if (ret < 0) + return ret; + + ret = drv_set_extended_addr(local, wpan_dev->extended_addr); + if (ret < 0) + return ret; + + ret = drv_set_short_addr(local, wpan_dev->short_addr); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_LBT) { + ret = drv_set_lbt_mode(local, wpan_dev->lbt); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { + ret = drv_set_csma_params(local, wpan_dev->min_be, + wpan_dev->max_be, + wpan_dev->csma_retries); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { + ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries); + if (ret < 0) + return ret; + } + + return 0; +} + static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); @@ -146,6 +196,10 @@ static int mac802154_slave_open(struct net_device *dev) set_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { + res = ieee802154_setup_hw(sdata); + if (res) + goto err; + res = drv_start(local); if (res) goto err; @@ -239,60 +293,13 @@ static int mac802154_wpan_open(struct net_device *dev) { int rc; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; - rc = mac802154_slave_open(dev); - if (rc < 0) - return rc; - - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - rc = drv_set_promiscuous_mode(local, - wpan_dev->promiscuous_mode); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_AFILT) { - rc = drv_set_pan_id(local, wpan_dev->pan_id); - if (rc < 0) - goto out; - - rc = drv_set_extended_addr(local, wpan_dev->extended_addr); - if (rc < 0) - goto out; - - rc = drv_set_short_addr(local, wpan_dev->short_addr); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_LBT) { - rc = drv_set_lbt_mode(local, wpan_dev->lbt); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { - rc = drv_set_csma_params(local, wpan_dev->min_be, - wpan_dev->max_be, - wpan_dev->csma_retries); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { - rc = drv_set_max_frame_retries(local, wpan_dev->frame_retries); - if (rc < 0) - goto out; - } - -out: - return rc; + return mac802154_slave_open(dev); } static int mac802154_slave_close(struct net_device *dev) -- cgit From 5c698e8bbfaa6e26d851eeeeee09d61dfc9ff4a0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 17 Jun 2015 15:35:18 +0200 Subject: mac802154: iface: cleanup stack variable There is no need to init res with zero, res can be unused but then we returning zero and not res. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 0b0cccb85336..8b698246a51b 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -189,7 +189,7 @@ static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; - int res = 0; + int res; ASSERT_RTNL(); -- cgit From a2105ae1de117e17522d37b6dedb1f286c544160 Mon Sep 17 00:00:00 2001 From: Glenn Ruben Bakke Date: Wed, 17 Jun 2015 07:32:23 -0700 Subject: Bluetooth: 6lowpan: Enable delete_netdev to be scheduled when last peer is deleted This patch fixes an issue with the netdev not being unregistered when the last peer is deleted. Removing the logical negation operator on the boolean solves this issue. If the last peer is removed the condition will be true, and the delete_netdev() is scheduled. Signed-off-by: Lukasz Duda Signed-off-by: Glenn Ruben Bakke Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index f3d6046c8ee7..3edc7315835e 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -977,7 +977,7 @@ static void chan_close_cb(struct l2cap_chan *chan) ifdown(dev->netdev); - if (!removed) { + if (removed) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } -- cgit From f63666d20973b85073c4849c9d7471219adf6da9 Mon Sep 17 00:00:00 2001 From: Glenn Ruben Bakke Date: Wed, 17 Jun 2015 07:32:24 -0700 Subject: Bluetooth: 6lowpan: Rename ambiguous variable This patch renames the variable used to trigger scheduling of delete_netdev. Changed to infinitiv in order to describe the action to be done. Signed-off-by: Lukasz Duda Signed-off-by: Glenn Ruben Bakke Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 3edc7315835e..7ee591aac2fc 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -937,7 +937,7 @@ static void chan_close_cb(struct l2cap_chan *chan) struct lowpan_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; - bool last = false, removed = true; + bool last = false, remove = true; BT_DBG("chan %p conn %p", chan, chan->conn); @@ -948,7 +948,7 @@ static void chan_close_cb(struct l2cap_chan *chan) /* If conn is set, then the netdev is also there and we should * not remove it. */ - removed = false; + remove = false; } spin_lock(&devices_lock); @@ -977,7 +977,7 @@ static void chan_close_cb(struct l2cap_chan *chan) ifdown(dev->netdev); - if (removed) { + if (remove) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } -- cgit From fc84242f7a9731885ae70c0077816fa6fb19c4d5 Mon Sep 17 00:00:00 2001 From: Glenn Ruben Bakke Date: Wed, 17 Jun 2015 07:32:25 -0700 Subject: Bluetooth: 6lowpan: Move netdev sysfs device reference This patch moves the sysfs device used by the netdev from the device of the first connected peer to the hci sysfs device. Using the sysfs device of hci instead of the first connected device fixes this issue such that the sysfs group of tx-0 and bt0 kobject are still present after the last peer has been deleted and all sysfs entries can be removed. Signed-off-by: Lukasz Duda Signed-off-by: Glenn Ruben Bakke Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 7ee591aac2fc..bc105a91c84a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -856,7 +856,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) set_dev_addr(netdev, &chan->src, chan->src_type); netdev->netdev_ops = &netdev_ops; - SET_NETDEV_DEV(netdev, &chan->conn->hcon->dev); + SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); err = register_netdev(netdev); -- cgit From 2ad88fb2c0ff57372406eff4df8dde020bf1a18d Mon Sep 17 00:00:00 2001 From: Glenn Ruben Bakke Date: Wed, 17 Jun 2015 07:32:26 -0700 Subject: Bluetooth: 6lowpan: Fix double kfree of netdev priv This patch removes the kfree of the netdev priv in device_event() upon NETDEV_UNREGISTER event. The freeing of memory is taken care of by the netdev destructor. Signed-off-by: Lukasz Duda Signed-off-by: Glenn Ruben Bakke Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index bc105a91c84a..071f9eb3805d 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -928,7 +928,7 @@ static void delete_netdev(struct work_struct *work) unregister_netdev(entry->netdev); - /* The entry pointer is deleted in device_event() */ + /* The entry pointer is deleted by the netdev destructor. */ } static void chan_close_cb(struct l2cap_chan *chan) @@ -1418,7 +1418,6 @@ static int device_event(struct notifier_block *unused, BT_DBG("Unregistered netdev %s %p", netdev->name, netdev); list_del(&entry->list); - kfree(entry); break; } } -- cgit From 89e4042861772a89fc54ce517255c3a38ba58ac1 Mon Sep 17 00:00:00 2001 From: Glenn Ruben Bakke Date: Wed, 17 Jun 2015 07:32:27 -0700 Subject: Bluetooth: 6lowpan: Fix module refcount This patch removes the additional module_put() in disconnect_all_peers() making a correct module refcount so that the module can be removed after disabling 6lowpan through debugfs. Signed-off-by: Lukasz Duda Signed-off-by: Glenn Ruben Bakke Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 071f9eb3805d..2fb7b3064904 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -1208,8 +1208,6 @@ static void disconnect_all_peers(void) list_del_rcu(&peer->list); kfree_rcu(peer, rcu); - - module_put(THIS_MODULE); } spin_unlock(&devices_lock); } -- cgit From 2dab80a8b486f02222a69daca6859519e05781d9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 15 Jun 2015 20:28:51 +0300 Subject: bridge: fix br_stp_set_bridge_priority race conditions After the ->set() spinlocks were removed br_stp_set_bridge_priority was left running without any protection when used via sysfs. It can race with port add/del and could result in use-after-free cases and corrupted lists. Tested by running port add/del in a loop with stp enabled while setting priority in a loop, crashes are easily reproducible. The spinlocks around sysfs ->set() were removed in commit: 14f98f258f19 ("bridge: range check STP parameters") There's also a race condition in the netlink priority support that is fixed by this change, but it was introduced recently and the fixes tag covers it, just in case it's needed the commit is: af615762e972 ("bridge: add ageing_time, stp_state, priority over netlink") Signed-off-by: Nikolay Aleksandrov Fixes: 14f98f258f19 ("bridge: range check STP parameters") Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 2 -- net/bridge/br_stp_if.c | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index a9a4a1b7863d..8d423bc649b9 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -247,9 +247,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - spin_lock_bh(&br->lock); br_stp_set_bridge_priority(br, args[1]); - spin_unlock_bh(&br->lock); return 0; case BRCTL_SET_PORT_PRIORITY: diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 41146872c1b4..7832d07f48f6 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -243,12 +243,13 @@ bool br_stp_recalculate_bridge_id(struct net_bridge *br) return true; } -/* called under bridge lock */ +/* Acquires and releases bridge lock */ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) { struct net_bridge_port *p; int wasroot; + spin_lock_bh(&br->lock); wasroot = br_is_root_bridge(br); list_for_each_entry(p, &br->port_list, list) { @@ -266,6 +267,7 @@ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); + spin_unlock_bh(&br->lock); } /* called under bridge lock */ -- cgit From ef493bd930ae482c608c5999b40d79dda3f2a674 Mon Sep 17 00:00:00 2001 From: Roman Kubiak Date: Fri, 12 Jun 2015 12:32:57 +0200 Subject: netfilter: nfnetlink_queue: add security context information This patch adds an additional attribute when sending packet information via netlink in netfilter_queue module. It will send additional security context data, so that userspace applications can verify this context against their own security databases. Signed-off-by: Roman Kubiak Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue_core.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 22a5ac76683e..6eccf0fcdc63 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -278,6 +278,23 @@ nla_put_failure: return -1; } +static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +{ + u32 seclen = 0; +#if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) + return 0; + + read_lock_bh(&skb->sk->sk_callback_lock); + + if (skb->secmark) + security_secid_to_secctx(skb->secmark, secdata, &seclen); + + read_unlock_bh(&skb->sk->sk_callback_lock); +#endif + return seclen; +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -297,6 +314,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_conn *ct = NULL; enum ip_conntrack_info uninitialized_var(ctinfo); bool csum_verify; + char *secdata = NULL; + u32 seclen = 0; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) @@ -352,6 +371,12 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + nla_total_size(sizeof(u_int32_t))); /* gid */ } + if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { + seclen = nfqnl_get_sk_secctx(entskb, &secdata); + if (seclen) + size += nla_total_size(seclen); + } + skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, GFP_ATOMIC); if (!skb) { @@ -479,6 +504,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; + if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + goto nla_put_failure; + if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) goto nla_put_failure; @@ -1142,7 +1170,12 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -EOPNOTSUPP; goto err_out_unlock; } - +#if !IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (flags & mask & NFQA_CFG_F_SECCTX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } +#endif spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; -- cgit From 01555e74bde51444c6898ef1800fb2bc697d479e Mon Sep 17 00:00:00 2001 From: Harout Hedeshian Date: Mon, 15 Jun 2015 18:40:43 -0600 Subject: netfilter: xt_socket: add XT_SOCKET_RESTORESKMARK flag xt_socket is useful for matching sockets with IP_TRANSPARENT and taking some action on the matching packets. However, it lacks the ability to match only a small subset of transparent sockets. Suppose there are 2 applications, each with its own set of transparent sockets. The first application wants all matching packets dropped, while the second application wants them forwarded somewhere else. Add the ability to retore the skb->mark from the sk_mark. The mark is only restored if a matching socket is found and the transparent / nowildcard conditions are satisfied. Now the 2 hypothetical applications can differentiate their sockets based on a mark value set with SO_MARK. iptables -t mangle -I PREROUTING -m socket --transparent \ --restore-skmark -j action iptables -t mangle -A action -m mark --mark 10 -j action2 iptables -t mangle -A action -m mark --mark 11 -j action3 Signed-off-by: Harout Hedeshian Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 59 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index e092cb046326..43e26c881100 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -205,6 +205,7 @@ static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) { + struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (!sk) @@ -226,6 +227,10 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, if (info->flags & XT_SOCKET_TRANSPARENT) transparent = xt_socket_sk_is_transparent(sk); + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; + if (sk != skb->sk) sock_gen_put(sk); @@ -247,7 +252,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) } static bool -socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } @@ -371,9 +376,10 @@ static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, } static bool -socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (!sk) @@ -395,6 +401,10 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) if (info->flags & XT_SOCKET_TRANSPARENT) transparent = xt_socket_sk_is_transparent(sk); + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; + if (sk != skb->sk) sock_gen_put(sk); @@ -428,6 +438,19 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par) return 0; } +static int socket_mt_v3_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo3 *info = + (struct xt_socket_mtinfo3 *)par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V3) { + pr_info("unknown flags 0x%x\n", + info->flags & ~XT_SOCKET_FLAGS_V3); + return -EINVAL; + } + return 0; +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -442,7 +465,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -454,7 +477,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -466,7 +489,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -478,13 +501,37 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, +#endif + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, #endif }; -- cgit From c4e70a87d975d1f561a00abfe2d3cefa2a486c95 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 16 Jun 2015 13:38:26 +0200 Subject: netfilter: bridge: rename br_netfilter.c to br_netfilter_hooks.c To prepare separation of the IPv6 code into different file. Signed-off-by: Pablo Neira Ayuso --- net/bridge/Makefile | 1 + net/bridge/br_netfilter.c | 1282 --------------------------------------- net/bridge/br_netfilter_hooks.c | 1282 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 1283 insertions(+), 1282 deletions(-) delete mode 100644 net/bridge/br_netfilter.c create mode 100644 net/bridge/br_netfilter_hooks.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index fd7ee03c59b3..c52577ac718e 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -12,6 +12,7 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o +br_netfilter-y := br_netfilter_hooks.o obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c deleted file mode 100644 index e4e5f2f29173..000000000000 --- a/net/bridge/br_netfilter.c +++ /dev/null @@ -1,1282 +0,0 @@ -/* - * Handle firewalling - * Linux ethernet bridge - * - * Authors: - * Lennert Buytenhek - * Bart De Schuymer - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Lennert dedicates this file to Kerstin Wurdinger. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include "br_private.h" -#ifdef CONFIG_SYSCTL -#include -#endif - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *brnf_sysctl_header; -static int brnf_call_iptables __read_mostly = 1; -static int brnf_call_ip6tables __read_mostly = 1; -static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly = 0; -static int brnf_filter_pppoe_tagged __read_mostly = 0; -static int brnf_pass_vlan_indev __read_mostly = 0; -#else -#define brnf_call_iptables 1 -#define brnf_call_ip6tables 1 -#define brnf_call_arptables 1 -#define brnf_filter_vlan_tagged 0 -#define brnf_filter_pppoe_tagged 0 -#define brnf_pass_vlan_indev 0 -#endif - -#define IS_IP(skb) \ - (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) - -#define IS_IPV6(skb) \ - (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) - -#define IS_ARP(skb) \ - (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) - -static inline __be16 vlan_proto(const struct sk_buff *skb) -{ - if (skb_vlan_tag_present(skb)) - return skb->protocol; - else if (skb->protocol == htons(ETH_P_8021Q)) - return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; - else - return 0; -} - -#define IS_VLAN_IP(skb) \ - (vlan_proto(skb) == htons(ETH_P_IP) && \ - brnf_filter_vlan_tagged) - -#define IS_VLAN_IPV6(skb) \ - (vlan_proto(skb) == htons(ETH_P_IPV6) && \ - brnf_filter_vlan_tagged) - -#define IS_VLAN_ARP(skb) \ - (vlan_proto(skb) == htons(ETH_P_ARP) && \ - brnf_filter_vlan_tagged) - -static inline __be16 pppoe_proto(const struct sk_buff *skb) -{ - return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + - sizeof(struct pppoe_hdr))); -} - -#define IS_PPPOE_IP(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IP) && \ - brnf_filter_pppoe_tagged) - -#define IS_PPPOE_IPV6(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IPV6) && \ - brnf_filter_pppoe_tagged) - -/* largest possible L2 header, see br_nf_dev_queue_xmit() */ -#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) - -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) -struct brnf_frag_data { - char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; - u8 encap_size; - u8 size; - u16 vlan_tci; - __be16 vlan_proto; -}; - -static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); -#endif - -static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb) -{ - return skb->nf_bridge; -} - -static void nf_bridge_info_free(struct sk_buff *skb) -{ - if (skb->nf_bridge) { - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; - } -} - -static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) -{ - struct net_bridge_port *port; - - port = br_port_get_rcu(dev); - return port ? &port->br->fake_rtable : NULL; -} - -static inline struct net_device *bridge_parent(const struct net_device *dev) -{ - struct net_bridge_port *port; - - port = br_port_get_rcu(dev); - return port ? port->br->dev : NULL; -} - -static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) -{ - skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); - if (likely(skb->nf_bridge)) - atomic_set(&(skb->nf_bridge->use), 1); - - return skb->nf_bridge; -} - -static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = skb->nf_bridge; - - if (atomic_read(&nf_bridge->use) > 1) { - struct nf_bridge_info *tmp = nf_bridge_alloc(skb); - - if (tmp) { - memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); - atomic_set(&tmp->use, 1); - } - nf_bridge_put(nf_bridge); - nf_bridge = tmp; - } - return nf_bridge; -} - -static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) -{ - switch (skb->protocol) { - case __cpu_to_be16(ETH_P_8021Q): - return VLAN_HLEN; - case __cpu_to_be16(ETH_P_PPP_SES): - return PPPOE_SES_HLEN; - default: - return 0; - } -} - -static inline void nf_bridge_push_encap_header(struct sk_buff *skb) -{ - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_push(skb, len); - skb->network_header -= len; -} - -static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) -{ - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_pull(skb, len); - skb->network_header += len; -} - -static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) -{ - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_pull_rcsum(skb, len); - skb->network_header += len; -} - -/* When handing a packet over to the IP layer - * check whether we have a skb that is in the - * expected format - */ - -static int br_validate_ipv4(struct sk_buff *skb) -{ - const struct iphdr *iph; - struct net_device *dev = skb->dev; - u32 len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto inhdr_error; - - iph = ip_hdr(skb); - - /* Basic sanity checks */ - if (iph->ihl < 5 || iph->version != 4) - goto inhdr_error; - - if (!pskb_may_pull(skb, iph->ihl*4)) - goto inhdr_error; - - iph = ip_hdr(skb); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - goto inhdr_error; - - len = ntohs(iph->tot_len); - if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); - goto drop; - } else if (len < (iph->ihl*4)) - goto inhdr_error; - - if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); - goto drop; - } - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - /* We should really parse IP options here but until - * somebody who actually uses IP options complains to - * us we'll just silently ignore the options because - * we're lazy! - */ - return 0; - -inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); -drop: - return -1; -} - -/* We only check the length. A bridge shouldn't do any hop-by-hop stuff - * anyway - */ -static int check_hbh_len(struct sk_buff *skb) -{ - unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); - u32 pkt_len; - const unsigned char *nh = skb_network_header(skb); - int off = raw - nh; - int len = (raw[1] + 1) << 3; - - if ((raw + len) - skb->data > skb_headlen(skb)) - goto bad; - - off += 2; - len -= 2; - - while (len > 0) { - int optlen = nh[off + 1] + 2; - - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; - break; - - case IPV6_TLV_PADN: - break; - - case IPV6_TLV_JUMBO: - if (nh[off + 1] != 4 || (off & 3) != 2) - goto bad; - pkt_len = ntohl(*(__be32 *)(nh + off + 2)); - if (pkt_len <= IPV6_MAXPLEN || - ipv6_hdr(skb)->payload_len) - goto bad; - if (pkt_len > skb->len - sizeof(struct ipv6hdr)) - goto bad; - if (pskb_trim_rcsum(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - nh = skb_network_header(skb); - break; - default: - if (optlen > len) - goto bad; - break; - } - off += optlen; - len -= optlen; - } - if (len == 0) - return 0; -bad: - return -1; -} - -/* Equivalent to br_validate_ipv4 for IPv6 */ -static int br_validate_ipv6(struct sk_buff *skb) -{ - const struct ipv6hdr *hdr; - struct net_device *dev = skb->dev; - struct inet6_dev *idev = in6_dev_get(skb->dev); - u32 pkt_len; - u8 ip6h_len = sizeof(struct ipv6hdr); - - if (!pskb_may_pull(skb, ip6h_len)) - goto inhdr_error; - - if (skb->len < ip6h_len) - goto drop; - - hdr = ipv6_hdr(skb); - - if (hdr->version != 6) - goto inhdr_error; - - pkt_len = ntohs(hdr->payload_len); - - if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { - if (pkt_len + ip6h_len > skb->len) { - IP6_INC_STATS_BH(dev_net(dev), idev, - IPSTATS_MIB_INTRUNCATEDPKTS); - goto drop; - } - if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { - IP6_INC_STATS_BH(dev_net(dev), idev, - IPSTATS_MIB_INDISCARDS); - goto drop; - } - } - if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) - goto drop; - - memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - /* No IP options in IPv6 header; however it should be - * checked if some next headers need special treatment - */ - return 0; - -inhdr_error: - IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); -drop: - return -1; -} - -static void nf_bridge_update_protocol(struct sk_buff *skb) -{ - switch (skb->nf_bridge->orig_proto) { - case BRNF_PROTO_8021Q: - skb->protocol = htons(ETH_P_8021Q); - break; - case BRNF_PROTO_PPPOE: - skb->protocol = htons(ETH_P_PPP_SES); - break; - case BRNF_PROTO_UNCHANGED: - break; - } -} - -/* Obtain the correct destination MAC address, while preserving the original - * source MAC address. If we already know this address, we just copy it. If we - * don't, we use the neighbour framework to find out. In both cases, we make - * sure that br_handle_frame_finish() is called afterwards. - */ -static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) -{ - struct neighbour *neigh; - struct dst_entry *dst; - - skb->dev = bridge_parent(skb->dev); - if (!skb->dev) - goto free_skb; - dst = skb_dst(skb); - neigh = dst_neigh_lookup_skb(dst, skb); - if (neigh) { - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - int ret; - - if (neigh->hh.hh_len) { - neigh_hh_bridge(&neigh->hh, skb); - skb->dev = nf_bridge->physindev; - ret = br_handle_frame_finish(sk, skb); - } else { - /* the neighbour function below overwrites the complete - * MAC header, so we save the Ethernet source address and - * protocol number. - */ - skb_copy_from_linear_data_offset(skb, - -(ETH_HLEN-ETH_ALEN), - nf_bridge->neigh_header, - ETH_HLEN-ETH_ALEN); - /* tell br_dev_xmit to continue with forwarding */ - nf_bridge->mask |= BRNF_BRIDGED_DNAT; - /* FIXME Need to refragment */ - ret = neigh->output(neigh, skb); - } - neigh_release(neigh); - return ret; - } -free_skb: - kfree_skb(skb); - return 0; -} - -static bool daddr_was_changed(const struct sk_buff *skb, - const struct nf_bridge_info *nf_bridge) -{ - switch (skb->protocol) { - case htons(ETH_P_IP): - return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; - case htons(ETH_P_IPV6): - return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, - sizeof(ipv6_hdr(skb)->daddr)) != 0; - default: - return false; - } -} - -/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables - * PREROUTING and continue the bridge PRE_ROUTING hook. See comment - * for br_nf_pre_routing_finish(), same logic is used here but - * equivalent IPv6 function ip6_route_input() called indirectly. - */ -static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct rtable *rt; - struct net_device *dev = skb->dev; - const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); - - nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; - - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; - if (daddr_was_changed(skb, nf_bridge)) { - skb_dst_drop(skb); - v6ops->route_input(skb); - - if (skb_dst(skb)->error) { - kfree_skb(skb); - return 0; - } - - if (skb_dst(skb)->dev == dev) { - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); - return 0; - } - ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); - skb->pkt_type = PACKET_HOST; - } else { - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; - } - skb_dst_set_noref(skb, &rt->dst); - } - - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - - return 0; -} - -/* This requires some explaining. If DNAT has taken place, - * we will need to fix up the destination Ethernet address. - * This is also true when SNAT takes place (for the reply direction). - * - * There are two cases to consider: - * 1. The packet was DNAT'ed to a device in the same bridge - * port group as it was received on. We can still bridge - * the packet. - * 2. The packet was DNAT'ed to a different device, either - * a non-bridged device or another bridge port group. - * The packet will need to be routed. - * - * The correct way of distinguishing between these two cases is to - * call ip_route_input() and to look at skb->dst->dev, which is - * changed to the destination device if ip_route_input() succeeds. - * - * Let's first consider the case that ip_route_input() succeeds: - * - * If the output device equals the logical bridge device the packet - * came in on, we can consider this bridging. The corresponding MAC - * address will be obtained in br_nf_pre_routing_finish_bridge. - * Otherwise, the packet is considered to be routed and we just - * change the destination MAC address so that the packet will - * later be passed up to the IP stack to be routed. For a redirected - * packet, ip_route_input() will give back the localhost as output device, - * which differs from the bridge device. - * - * Let's now consider the case that ip_route_input() fails: - * - * This can be because the destination address is martian, in which case - * the packet will be dropped. - * If IP forwarding is disabled, ip_route_input() will fail, while - * ip_route_output_key() can return success. The source - * address for ip_route_output_key() is set to zero, so ip_route_output_key() - * thinks we're handling a locally generated packet and won't care - * if IP forwarding is enabled. If the output device equals the logical bridge - * device, we proceed as if ip_route_input() succeeded. If it differs from the - * logical bridge port or if ip_route_output_key() fails we drop the packet. - */ -static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct iphdr *iph = ip_hdr(skb); - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct rtable *rt; - int err; - - nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; - - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; - if (daddr_was_changed(skb, nf_bridge)) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { - struct in_device *in_dev = __in_dev_get_rcu(dev); - - /* If err equals -EHOSTUNREACH the error is due to a - * martian destination or due to the fact that - * forwarding is disabled. For most martian packets, - * ip_route_output_key() will fail. It won't fail for 2 types of - * martian destinations: loopback destinations and destination - * 0.0.0.0. In both cases the packet will be dropped because the - * destination is the loopback device and not the bridge. */ - if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) - goto free_skb; - - rt = ip_route_output(dev_net(dev), iph->daddr, 0, - RT_TOS(iph->tos), 0); - if (!IS_ERR(rt)) { - /* - Bridged-and-DNAT'ed traffic doesn't - * require ip_forwarding. */ - if (rt->dst.dev == dev) { - skb_dst_set(skb, &rt->dst); - goto bridged_dnat; - } - ip_rt_put(rt); - } -free_skb: - kfree_skb(skb); - return 0; - } else { - if (skb_dst(skb)->dev == dev) { -bridged_dnat: - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, - NF_BR_PRE_ROUTING, - sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); - return 0; - } - ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); - skb->pkt_type = PACKET_HOST; - } - } else { - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; - } - skb_dst_set_noref(skb, &rt->dst); - } - - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - - return 0; -} - -static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) -{ - struct net_device *vlan, *br; - - br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) - return br; - - vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, - skb_vlan_tag_get(skb) & VLAN_VID_MASK); - - return vlan ? vlan : br; -} - -/* Some common code for IPv4/IPv6 */ -static struct net_device *setup_pre_routing(struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - - if (skb->pkt_type == PACKET_OTHERHOST) { - skb->pkt_type = PACKET_HOST; - nf_bridge->pkt_otherhost = true; - } - - nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; - nf_bridge->physindev = skb->dev; - skb->dev = brnf_get_logical_dev(skb, skb->dev); - - if (skb->protocol == htons(ETH_P_8021Q)) - nf_bridge->orig_proto = BRNF_PROTO_8021Q; - else if (skb->protocol == htons(ETH_P_PPP_SES)) - nf_bridge->orig_proto = BRNF_PROTO_PPPOE; - - /* Must drop socket now because of tproxy. */ - skb_orphan(skb); - return skb->dev; -} - -/* Replicate the checks that IPv6 does on packet reception and pass the packet - * to ip6tables. - */ -static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge; - - if (br_validate_ipv6(skb)) - return NF_DROP; - - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) - return NF_DROP; - if (!setup_pre_routing(skb)) - return NF_DROP; - - nf_bridge = nf_bridge_info_get(skb); - nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; - - skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, - skb->dev, NULL, - br_nf_pre_routing_finish_ipv6); - - return NF_STOLEN; -} - -/* Direct IPv6 traffic to br_nf_pre_routing_ipv6. - * Replicate the checks that IPv4 does on packet reception. - * Set skb->dev to the bridge device (i.e. parent of the - * receiving device) to make netfilter happy, the REDIRECT - * target in particular. Save the original destination IP - * address to be able to detect DNAT afterwards. */ -static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge; - struct net_bridge_port *p; - struct net_bridge *br; - __u32 len = nf_bridge_encap_header_len(skb); - - if (unlikely(!pskb_may_pull(skb, len))) - return NF_DROP; - - p = br_port_get_rcu(state->in); - if (p == NULL) - return NF_DROP; - br = p->br; - - if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { - if (!brnf_call_ip6tables && !br->nf_call_ip6tables) - return NF_ACCEPT; - - nf_bridge_pull_encap_header_rcsum(skb); - return br_nf_pre_routing_ipv6(ops, skb, state); - } - - if (!brnf_call_iptables && !br->nf_call_iptables) - return NF_ACCEPT; - - if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) - return NF_ACCEPT; - - nf_bridge_pull_encap_header_rcsum(skb); - - if (br_validate_ipv4(skb)) - return NF_DROP; - - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) - return NF_DROP; - if (!setup_pre_routing(skb)) - return NF_DROP; - - nf_bridge = nf_bridge_info_get(skb); - nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; - - skb->protocol = htons(ETH_P_IP); - - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, - skb->dev, NULL, - br_nf_pre_routing_finish); - - return NF_STOLEN; -} - - -/* PF_BRIDGE/LOCAL_IN ************************************************/ -/* The packet is locally destined, which requires a real - * dst_entry, so detach the fake one. On the way up, the - * packet would pass through PRE_ROUTING again (which already - * took place when the packet entered the bridge), but we - * register an IPv4 PRE_ROUTING 'sabotage' hook that will - * prevent this from happening. */ -static unsigned int br_nf_local_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - br_drop_fake_rtable(skb); - return NF_ACCEPT; -} - -/* PF_BRIDGE/FORWARD *************************************************/ -static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct net_device *in; - - if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { - - if (skb->protocol == htons(ETH_P_IP)) - nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; - - if (skb->protocol == htons(ETH_P_IPV6)) - nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; - - in = nf_bridge->physindev; - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge_update_protocol(skb); - } else { - in = *((struct net_device **)(skb->cb)); - } - nf_bridge_push_encap_header(skb); - - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb, - in, skb->dev, br_forward_finish, 1); - return 0; -} - - -/* This is the 'purely bridged' case. For IP, we pass the packet to - * netfilter with indev and outdev set to the bridge device, - * but we are still able to filter on the 'real' indev/outdev - * because of the physdev module. For ARP, indev and outdev are the - * bridge ports. */ -static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge; - struct net_device *parent; - u_int8_t pf; - - if (!skb->nf_bridge) - return NF_ACCEPT; - - /* Need exclusive nf_bridge_info since we might have multiple - * different physoutdevs. */ - if (!nf_bridge_unshare(skb)) - return NF_DROP; - - nf_bridge = nf_bridge_info_get(skb); - if (!nf_bridge) - return NF_DROP; - - parent = bridge_parent(state->out); - if (!parent) - return NF_DROP; - - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) - pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) - pf = NFPROTO_IPV6; - else - return NF_ACCEPT; - - nf_bridge_pull_encap_header(skb); - - if (skb->pkt_type == PACKET_OTHERHOST) { - skb->pkt_type = PACKET_HOST; - nf_bridge->pkt_otherhost = true; - } - - if (pf == NFPROTO_IPV4) { - if (br_validate_ipv4(skb)) - return NF_DROP; - IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; - } - - if (pf == NFPROTO_IPV6) { - if (br_validate_ipv6(skb)) - return NF_DROP; - IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; - } - - nf_bridge->physoutdev = skb->dev; - if (pf == NFPROTO_IPV4) - skb->protocol = htons(ETH_P_IP); - else - skb->protocol = htons(ETH_P_IPV6); - - NF_HOOK(pf, NF_INET_FORWARD, NULL, skb, - brnf_get_logical_dev(skb, state->in), - parent, br_nf_forward_finish); - - return NF_STOLEN; -} - -static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct net_bridge_port *p; - struct net_bridge *br; - struct net_device **d = (struct net_device **)(skb->cb); - - p = br_port_get_rcu(state->out); - if (p == NULL) - return NF_ACCEPT; - br = p->br; - - if (!brnf_call_arptables && !br->nf_call_arptables) - return NF_ACCEPT; - - if (!IS_ARP(skb)) { - if (!IS_VLAN_ARP(skb)) - return NF_ACCEPT; - nf_bridge_pull_encap_header(skb); - } - - if (arp_hdr(skb)->ar_pln != 4) { - if (IS_VLAN_ARP(skb)) - nf_bridge_push_encap_header(skb); - return NF_ACCEPT; - } - *d = state->in; - NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb, - state->in, state->out, br_nf_forward_finish); - - return NF_STOLEN; -} - -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) -static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) -{ - struct brnf_frag_data *data; - int err; - - data = this_cpu_ptr(&brnf_frag_data_storage); - err = skb_cow_head(skb, data->size); - - if (err) { - kfree_skb(skb); - return 0; - } - - if (data->vlan_tci) { - skb->vlan_tci = data->vlan_tci; - skb->vlan_proto = data->vlan_proto; - } - - skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); - __skb_push(skb, data->encap_size); - - nf_bridge_info_free(skb); - return br_dev_queue_push_xmit(sk, skb); -} -#endif - -static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) -{ - unsigned int mtu = ip_skb_dst_mtu(skb); - struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || - (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > mtu))) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); - kfree_skb(skb); - return -EMSGSIZE; - } - - return ip_do_fragment(sk, skb, output); -} - -static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) -{ - if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) - return PPPOE_SES_HLEN; - return 0; -} - -static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge; - unsigned int mtu_reserved; - - mtu_reserved = nf_bridge_mtu_reduction(skb); - - if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { - nf_bridge_info_free(skb); - return br_dev_queue_push_xmit(sk, skb); - } - - nf_bridge = nf_bridge_info_get(skb); - -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) - /* This is wrong! We should preserve the original fragment - * boundaries by preserving frag_list rather than refragmenting. - */ - if (skb->protocol == htons(ETH_P_IP)) { - struct brnf_frag_data *data; - - if (br_validate_ipv4(skb)) - return NF_DROP; - - IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; - - nf_bridge_update_protocol(skb); - - data = this_cpu_ptr(&brnf_frag_data_storage); - - data->vlan_tci = skb->vlan_tci; - data->vlan_proto = skb->vlan_proto; - data->encap_size = nf_bridge_encap_header_len(skb); - data->size = ETH_HLEN + data->encap_size; - - skb_copy_from_linear_data_offset(skb, -data->size, data->mac, - data->size); - - return br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); - } -#endif -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - if (skb->protocol == htons(ETH_P_IPV6)) { - const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); - struct brnf_frag_data *data; - - if (br_validate_ipv6(skb)) - return NF_DROP; - - IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; - - nf_bridge_update_protocol(skb); - - data = this_cpu_ptr(&brnf_frag_data_storage); - data->encap_size = nf_bridge_encap_header_len(skb); - data->size = ETH_HLEN + data->encap_size; - - skb_copy_from_linear_data_offset(skb, -data->size, data->mac, - data->size); - - if (v6ops) - return v6ops->fragment(sk, skb, br_nf_push_frag_xmit); - else - return -EMSGSIZE; - } -#endif - nf_bridge_info_free(skb); - return br_dev_queue_push_xmit(sk, skb); -} - -/* PF_BRIDGE/POST_ROUTING ********************************************/ -static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct net_device *realoutdev = bridge_parent(skb->dev); - u_int8_t pf; - - /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in - * on a bridge, but was delivered locally and is now being routed: - * - * POST_ROUTING was already invoked from the ip stack. - */ - if (!nf_bridge || !nf_bridge->physoutdev) - return NF_ACCEPT; - - if (!realoutdev) - return NF_DROP; - - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) - pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) - pf = NFPROTO_IPV6; - else - return NF_ACCEPT; - - /* We assume any code from br_dev_queue_push_xmit onwards doesn't care - * about the value of skb->pkt_type. */ - if (skb->pkt_type == PACKET_OTHERHOST) { - skb->pkt_type = PACKET_HOST; - nf_bridge->pkt_otherhost = true; - } - - nf_bridge_pull_encap_header(skb); - if (pf == NFPROTO_IPV4) - skb->protocol = htons(ETH_P_IP); - else - skb->protocol = htons(ETH_P_IPV6); - - NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb, - NULL, realoutdev, - br_nf_dev_queue_xmit); - - return NF_STOLEN; -} - -/* IP/SABOTAGE *****************************************************/ -/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING - * for the second time. */ -static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (skb->nf_bridge && - !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { - return NF_STOP; - } - - return NF_ACCEPT; -} - -/* This is called when br_netfilter has called into iptables/netfilter, - * and DNAT has taken place on a bridge-forwarded packet. - * - * neigh->output has created a new MAC header, with local br0 MAC - * as saddr. - * - * This restores the original MAC saddr of the bridged packet - * before invoking bridge forward logic to transmit the packet. - */ -static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - - skb_pull(skb, ETH_HLEN); - nf_bridge->mask &= ~BRNF_BRIDGED_DNAT; - - BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); - - skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), - nf_bridge->neigh_header, - ETH_HLEN - ETH_ALEN); - skb->dev = nf_bridge->physindev; - - nf_bridge->physoutdev = NULL; - br_handle_frame_finish(NULL, skb); -} - -static int br_nf_dev_xmit(struct sk_buff *skb) -{ - if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { - br_nf_pre_routing_finish_bridge_slow(skb); - return 1; - } - return 0; -} - -static const struct nf_br_ops br_ops = { - .br_dev_xmit_hook = br_nf_dev_xmit, -}; - -void br_netfilter_enable(void) -{ -} -EXPORT_SYMBOL_GPL(br_netfilter_enable); - -/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because - * br_dev_queue_push_xmit is called afterwards */ -static struct nf_hook_ops br_nf_ops[] __read_mostly = { - { - .hook = br_nf_pre_routing, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_PRE_ROUTING, - .priority = NF_BR_PRI_BRNF, - }, - { - .hook = br_nf_local_in, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_LOCAL_IN, - .priority = NF_BR_PRI_BRNF, - }, - { - .hook = br_nf_forward_ip, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_FORWARD, - .priority = NF_BR_PRI_BRNF - 1, - }, - { - .hook = br_nf_forward_arp, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_FORWARD, - .priority = NF_BR_PRI_BRNF, - }, - { - .hook = br_nf_post_routing, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_POST_ROUTING, - .priority = NF_BR_PRI_LAST, - }, - { - .hook = ip_sabotage_in, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_FIRST, - }, - { - .hook = ip_sabotage_in, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_FIRST, - }, -}; - -#ifdef CONFIG_SYSCTL -static -int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); - - if (write && *(int *)(ctl->data)) - *(int *)(ctl->data) = 1; - return ret; -} - -static struct ctl_table brnf_table[] = { - { - .procname = "bridge-nf-call-arptables", - .data = &brnf_call_arptables, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-call-iptables", - .data = &brnf_call_iptables, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-call-ip6tables", - .data = &brnf_call_ip6tables, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-filter-vlan-tagged", - .data = &brnf_filter_vlan_tagged, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-filter-pppoe-tagged", - .data = &brnf_filter_pppoe_tagged, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-pass-vlan-input-dev", - .data = &brnf_pass_vlan_indev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { } -}; -#endif - -static int __init br_netfilter_init(void) -{ - int ret; - - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - if (ret < 0) - return ret; - -#ifdef CONFIG_SYSCTL - brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); - if (brnf_sysctl_header == NULL) { - printk(KERN_WARNING - "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - return -ENOMEM; - } -#endif - RCU_INIT_POINTER(nf_br_ops, &br_ops); - printk(KERN_NOTICE "Bridge firewalling registered\n"); - return 0; -} - -static void __exit br_netfilter_fini(void) -{ - RCU_INIT_POINTER(nf_br_ops, NULL); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); -#ifdef CONFIG_SYSCTL - unregister_net_sysctl_table(brnf_sysctl_header); -#endif -} - -module_init(br_netfilter_init); -module_exit(br_netfilter_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Lennert Buytenhek "); -MODULE_AUTHOR("Bart De Schuymer "); -MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c new file mode 100644 index 000000000000..e4e5f2f29173 --- /dev/null +++ b/net/bridge/br_netfilter_hooks.c @@ -0,0 +1,1282 @@ +/* + * Handle firewalling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *brnf_sysctl_header; +static int brnf_call_iptables __read_mostly = 1; +static int brnf_call_ip6tables __read_mostly = 1; +static int brnf_call_arptables __read_mostly = 1; +static int brnf_filter_vlan_tagged __read_mostly = 0; +static int brnf_filter_pppoe_tagged __read_mostly = 0; +static int brnf_pass_vlan_indev __read_mostly = 0; +#else +#define brnf_call_iptables 1 +#define brnf_call_ip6tables 1 +#define brnf_call_arptables 1 +#define brnf_filter_vlan_tagged 0 +#define brnf_filter_pppoe_tagged 0 +#define brnf_pass_vlan_indev 0 +#endif + +#define IS_IP(skb) \ + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) + +#define IS_IPV6(skb) \ + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) + +#define IS_ARP(skb) \ + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) + +static inline __be16 vlan_proto(const struct sk_buff *skb) +{ + if (skb_vlan_tag_present(skb)) + return skb->protocol; + else if (skb->protocol == htons(ETH_P_8021Q)) + return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + else + return 0; +} + +#define IS_VLAN_IP(skb) \ + (vlan_proto(skb) == htons(ETH_P_IP) && \ + brnf_filter_vlan_tagged) + +#define IS_VLAN_IPV6(skb) \ + (vlan_proto(skb) == htons(ETH_P_IPV6) && \ + brnf_filter_vlan_tagged) + +#define IS_VLAN_ARP(skb) \ + (vlan_proto(skb) == htons(ETH_P_ARP) && \ + brnf_filter_vlan_tagged) + +static inline __be16 pppoe_proto(const struct sk_buff *skb) +{ + return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); +} + +#define IS_PPPOE_IP(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IP) && \ + brnf_filter_pppoe_tagged) + +#define IS_PPPOE_IPV6(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IPV6) && \ + brnf_filter_pppoe_tagged) + +/* largest possible L2 header, see br_nf_dev_queue_xmit() */ +#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) + +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) +struct brnf_frag_data { + char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; + u8 encap_size; + u8 size; + u16 vlan_tci; + __be16 vlan_proto; +}; + +static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); +#endif + +static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb) +{ + return skb->nf_bridge; +} + +static void nf_bridge_info_free(struct sk_buff *skb) +{ + if (skb->nf_bridge) { + nf_bridge_put(skb->nf_bridge); + skb->nf_bridge = NULL; + } +} + +static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) +{ + struct net_bridge_port *port; + + port = br_port_get_rcu(dev); + return port ? &port->br->fake_rtable : NULL; +} + +static inline struct net_device *bridge_parent(const struct net_device *dev) +{ + struct net_bridge_port *port; + + port = br_port_get_rcu(dev); + return port ? port->br->dev : NULL; +} + +static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) +{ + skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); + if (likely(skb->nf_bridge)) + atomic_set(&(skb->nf_bridge->use), 1); + + return skb->nf_bridge; +} + +static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (atomic_read(&nf_bridge->use) > 1) { + struct nf_bridge_info *tmp = nf_bridge_alloc(skb); + + if (tmp) { + memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); + atomic_set(&tmp->use, 1); + } + nf_bridge_put(nf_bridge); + nf_bridge = tmp; + } + return nf_bridge; +} + +static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case __cpu_to_be16(ETH_P_8021Q): + return VLAN_HLEN; + case __cpu_to_be16(ETH_P_PPP_SES): + return PPPOE_SES_HLEN; + default: + return 0; + } +} + +static inline void nf_bridge_push_encap_header(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_push(skb, len); + skb->network_header -= len; +} + +static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_pull(skb, len); + skb->network_header += len; +} + +static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_pull_rcsum(skb, len); + skb->network_header += len; +} + +/* When handing a packet over to the IP layer + * check whether we have a skb that is in the + * expected format + */ + +static int br_validate_ipv4(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct net_device *dev = skb->dev; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + + iph = ip_hdr(skb); + + /* Basic sanity checks */ + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = ip_hdr(skb); + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto inhdr_error; + + len = ntohs(iph->tot_len); + if (skb->len < len) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) + goto inhdr_error; + + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + /* We should really parse IP options here but until + * somebody who actually uses IP options complains to + * us we'll just silently ignore the options because + * we're lazy! + */ + return 0; + +inhdr_error: + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + +/* We only check the length. A bridge shouldn't do any hop-by-hop stuff + * anyway + */ +static int check_hbh_len(struct sk_buff *skb) +{ + unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); + u32 pkt_len; + const unsigned char *nh = skb_network_header(skb); + int off = raw - nh; + int len = (raw[1] + 1) << 3; + + if ((raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + case IPV6_TLV_JUMBO: + if (nh[off + 1] != 4 || (off & 3) != 2) + goto bad; + pkt_len = ntohl(*(__be32 *)(nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + goto bad; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + goto bad; + if (pskb_trim_rcsum(skb, + pkt_len + sizeof(struct ipv6hdr))) + goto bad; + nh = skb_network_header(skb); + break; + default: + if (optlen > len) + goto bad; + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 0; +bad: + return -1; +} + +/* Equivalent to br_validate_ipv4 for IPv6 */ +static int br_validate_ipv6(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + struct net_device *dev = skb->dev; + struct inet6_dev *idev = in6_dev_get(skb->dev); + u32 pkt_len; + u8 ip6h_len = sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, ip6h_len)) + goto inhdr_error; + + if (skb->len < ip6h_len) + goto drop; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + goto inhdr_error; + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + ip6h_len > skb->len) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) + goto drop; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + /* No IP options in IPv6 header; however it should be + * checked if some next headers need special treatment + */ + return 0; + +inhdr_error: + IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + +static void nf_bridge_update_protocol(struct sk_buff *skb) +{ + switch (skb->nf_bridge->orig_proto) { + case BRNF_PROTO_8021Q: + skb->protocol = htons(ETH_P_8021Q); + break; + case BRNF_PROTO_PPPOE: + skb->protocol = htons(ETH_P_PPP_SES); + break; + case BRNF_PROTO_UNCHANGED: + break; + } +} + +/* Obtain the correct destination MAC address, while preserving the original + * source MAC address. If we already know this address, we just copy it. If we + * don't, we use the neighbour framework to find out. In both cases, we make + * sure that br_handle_frame_finish() is called afterwards. + */ +static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) +{ + struct neighbour *neigh; + struct dst_entry *dst; + + skb->dev = bridge_parent(skb->dev); + if (!skb->dev) + goto free_skb; + dst = skb_dst(skb); + neigh = dst_neigh_lookup_skb(dst, skb); + if (neigh) { + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + int ret; + + if (neigh->hh.hh_len) { + neigh_hh_bridge(&neigh->hh, skb); + skb->dev = nf_bridge->physindev; + ret = br_handle_frame_finish(sk, skb); + } else { + /* the neighbour function below overwrites the complete + * MAC header, so we save the Ethernet source address and + * protocol number. + */ + skb_copy_from_linear_data_offset(skb, + -(ETH_HLEN-ETH_ALEN), + nf_bridge->neigh_header, + ETH_HLEN-ETH_ALEN); + /* tell br_dev_xmit to continue with forwarding */ + nf_bridge->mask |= BRNF_BRIDGED_DNAT; + /* FIXME Need to refragment */ + ret = neigh->output(neigh, skb); + } + neigh_release(neigh); + return ret; + } +free_skb: + kfree_skb(skb); + return 0; +} + +static bool daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; + case htons(ETH_P_IPV6): + return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, + sizeof(ipv6_hdr(skb)->daddr)) != 0; + default: + return false; + } +} + +/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables + * PREROUTING and continue the bridge PRE_ROUTING hook. See comment + * for br_nf_pre_routing_finish(), same logic is used here but + * equivalent IPv6 function ip6_route_input() called indirectly. + */ +static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct rtable *rt; + struct net_device *dev = skb->dev; + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + if (daddr_was_changed(skb, nf_bridge)) { + skb_dst_drop(skb); + v6ops->route_input(skb); + + if (skb_dst(skb)->error) { + kfree_skb(skb); + return 0; + } + + if (skb_dst(skb)->dev == dev) { + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, + sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); + skb->pkt_type = PACKET_HOST; + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + } + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +/* This requires some explaining. If DNAT has taken place, + * we will need to fix up the destination Ethernet address. + * This is also true when SNAT takes place (for the reply direction). + * + * There are two cases to consider: + * 1. The packet was DNAT'ed to a device in the same bridge + * port group as it was received on. We can still bridge + * the packet. + * 2. The packet was DNAT'ed to a different device, either + * a non-bridged device or another bridge port group. + * The packet will need to be routed. + * + * The correct way of distinguishing between these two cases is to + * call ip_route_input() and to look at skb->dst->dev, which is + * changed to the destination device if ip_route_input() succeeds. + * + * Let's first consider the case that ip_route_input() succeeds: + * + * If the output device equals the logical bridge device the packet + * came in on, we can consider this bridging. The corresponding MAC + * address will be obtained in br_nf_pre_routing_finish_bridge. + * Otherwise, the packet is considered to be routed and we just + * change the destination MAC address so that the packet will + * later be passed up to the IP stack to be routed. For a redirected + * packet, ip_route_input() will give back the localhost as output device, + * which differs from the bridge device. + * + * Let's now consider the case that ip_route_input() fails: + * + * This can be because the destination address is martian, in which case + * the packet will be dropped. + * If IP forwarding is disabled, ip_route_input() will fail, while + * ip_route_output_key() can return success. The source + * address for ip_route_output_key() is set to zero, so ip_route_output_key() + * thinks we're handling a locally generated packet and won't care + * if IP forwarding is enabled. If the output device equals the logical bridge + * device, we proceed as if ip_route_input() succeeded. If it differs from the + * logical bridge port or if ip_route_output_key() fails we drop the packet. + */ +static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct iphdr *iph = ip_hdr(skb); + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct rtable *rt; + int err; + + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; + + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + if (daddr_was_changed(skb, nf_bridge)) { + if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + /* If err equals -EHOSTUNREACH the error is due to a + * martian destination or due to the fact that + * forwarding is disabled. For most martian packets, + * ip_route_output_key() will fail. It won't fail for 2 types of + * martian destinations: loopback destinations and destination + * 0.0.0.0. In both cases the packet will be dropped because the + * destination is the loopback device and not the bridge. */ + if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) + goto free_skb; + + rt = ip_route_output(dev_net(dev), iph->daddr, 0, + RT_TOS(iph->tos), 0); + if (!IS_ERR(rt)) { + /* - Bridged-and-DNAT'ed traffic doesn't + * require ip_forwarding. */ + if (rt->dst.dev == dev) { + skb_dst_set(skb, &rt->dst); + goto bridged_dnat; + } + ip_rt_put(rt); + } +free_skb: + kfree_skb(skb); + return 0; + } else { + if (skb_dst(skb)->dev == dev) { +bridged_dnat: + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, + NF_BR_PRE_ROUTING, + sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); + skb->pkt_type = PACKET_HOST; + } + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + } + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) +{ + struct net_device *vlan, *br; + + br = bridge_parent(dev); + if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) + return br; + + vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, + skb_vlan_tag_get(skb) & VLAN_VID_MASK); + + return vlan ? vlan : br; +} + +/* Some common code for IPv4/IPv6 */ +static struct net_device *setup_pre_routing(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->pkt_otherhost = true; + } + + nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->physindev = skb->dev; + skb->dev = brnf_get_logical_dev(skb, skb->dev); + + if (skb->protocol == htons(ETH_P_8021Q)) + nf_bridge->orig_proto = BRNF_PROTO_8021Q; + else if (skb->protocol == htons(ETH_P_PPP_SES)) + nf_bridge->orig_proto = BRNF_PROTO_PPPOE; + + /* Must drop socket now because of tproxy. */ + skb_orphan(skb); + return skb->dev; +} + +/* Replicate the checks that IPv6 does on packet reception and pass the packet + * to ip6tables. + */ +static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + + if (br_validate_ipv6(skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; + + skb->protocol = htons(ETH_P_IPV6); + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, + skb->dev, NULL, + br_nf_pre_routing_finish_ipv6); + + return NF_STOLEN; +} + +/* Direct IPv6 traffic to br_nf_pre_routing_ipv6. + * Replicate the checks that IPv4 does on packet reception. + * Set skb->dev to the bridge device (i.e. parent of the + * receiving device) to make netfilter happy, the REDIRECT + * target in particular. Save the original destination IP + * address to be able to detect DNAT afterwards. */ +static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + struct net_bridge_port *p; + struct net_bridge *br; + __u32 len = nf_bridge_encap_header_len(skb); + + if (unlikely(!pskb_may_pull(skb, len))) + return NF_DROP; + + p = br_port_get_rcu(state->in); + if (p == NULL) + return NF_DROP; + br = p->br; + + if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { + if (!brnf_call_ip6tables && !br->nf_call_ip6tables) + return NF_ACCEPT; + + nf_bridge_pull_encap_header_rcsum(skb); + return br_nf_pre_routing_ipv6(ops, skb, state); + } + + if (!brnf_call_iptables && !br->nf_call_iptables) + return NF_ACCEPT; + + if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) + return NF_ACCEPT; + + nf_bridge_pull_encap_header_rcsum(skb); + + if (br_validate_ipv4(skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; + + skb->protocol = htons(ETH_P_IP); + + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, + skb->dev, NULL, + br_nf_pre_routing_finish); + + return NF_STOLEN; +} + + +/* PF_BRIDGE/LOCAL_IN ************************************************/ +/* The packet is locally destined, which requires a real + * dst_entry, so detach the fake one. On the way up, the + * packet would pass through PRE_ROUTING again (which already + * took place when the packet entered the bridge), but we + * register an IPv4 PRE_ROUTING 'sabotage' hook that will + * prevent this from happening. */ +static unsigned int br_nf_local_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + br_drop_fake_rtable(skb); + return NF_ACCEPT; +} + +/* PF_BRIDGE/FORWARD *************************************************/ +static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *in; + + if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { + + if (skb->protocol == htons(ETH_P_IP)) + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; + + if (skb->protocol == htons(ETH_P_IPV6)) + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + + in = nf_bridge->physindev; + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge_update_protocol(skb); + } else { + in = *((struct net_device **)(skb->cb)); + } + nf_bridge_push_encap_header(skb); + + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb, + in, skb->dev, br_forward_finish, 1); + return 0; +} + + +/* This is the 'purely bridged' case. For IP, we pass the packet to + * netfilter with indev and outdev set to the bridge device, + * but we are still able to filter on the 'real' indev/outdev + * because of the physdev module. For ARP, indev and outdev are the + * bridge ports. */ +static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + struct net_device *parent; + u_int8_t pf; + + if (!skb->nf_bridge) + return NF_ACCEPT; + + /* Need exclusive nf_bridge_info since we might have multiple + * different physoutdevs. */ + if (!nf_bridge_unshare(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + if (!nf_bridge) + return NF_DROP; + + parent = bridge_parent(state->out); + if (!parent) + return NF_DROP; + + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + pf = NFPROTO_IPV4; + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + pf = NFPROTO_IPV6; + else + return NF_ACCEPT; + + nf_bridge_pull_encap_header(skb); + + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->pkt_otherhost = true; + } + + if (pf == NFPROTO_IPV4) { + if (br_validate_ipv4(skb)) + return NF_DROP; + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + } + + if (pf == NFPROTO_IPV6) { + if (br_validate_ipv6(skb)) + return NF_DROP; + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + } + + nf_bridge->physoutdev = skb->dev; + if (pf == NFPROTO_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + NF_HOOK(pf, NF_INET_FORWARD, NULL, skb, + brnf_get_logical_dev(skb, state->in), + parent, br_nf_forward_finish); + + return NF_STOLEN; +} + +static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct net_bridge_port *p; + struct net_bridge *br; + struct net_device **d = (struct net_device **)(skb->cb); + + p = br_port_get_rcu(state->out); + if (p == NULL) + return NF_ACCEPT; + br = p->br; + + if (!brnf_call_arptables && !br->nf_call_arptables) + return NF_ACCEPT; + + if (!IS_ARP(skb)) { + if (!IS_VLAN_ARP(skb)) + return NF_ACCEPT; + nf_bridge_pull_encap_header(skb); + } + + if (arp_hdr(skb)->ar_pln != 4) { + if (IS_VLAN_ARP(skb)) + nf_bridge_push_encap_header(skb); + return NF_ACCEPT; + } + *d = state->in; + NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb, + state->in, state->out, br_nf_forward_finish); + + return NF_STOLEN; +} + +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) +{ + struct brnf_frag_data *data; + int err; + + data = this_cpu_ptr(&brnf_frag_data_storage); + err = skb_cow_head(skb, data->size); + + if (err) { + kfree_skb(skb); + return 0; + } + + if (data->vlan_tci) { + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + } + + skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); + __skb_push(skb, data->encap_size); + + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(sk, skb); +} +#endif + +static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) +{ + unsigned int mtu = ip_skb_dst_mtu(skb); + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(sk, skb, output); +} + +static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) +{ + if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) + return PPPOE_SES_HLEN; + return 0; +} + +static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge; + unsigned int mtu_reserved; + + mtu_reserved = nf_bridge_mtu_reduction(skb); + + if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(sk, skb); + } + + nf_bridge = nf_bridge_info_get(skb); + +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + /* This is wrong! We should preserve the original fragment + * boundaries by preserving frag_list rather than refragmenting. + */ + if (skb->protocol == htons(ETH_P_IP)) { + struct brnf_frag_data *data; + + if (br_validate_ipv4(skb)) + return NF_DROP; + + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + + nf_bridge_update_protocol(skb); + + data = this_cpu_ptr(&brnf_frag_data_storage); + + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->encap_size = nf_bridge_encap_header_len(skb); + data->size = ETH_HLEN + data->encap_size; + + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, + data->size); + + return br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); + } +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + struct brnf_frag_data *data; + + if (br_validate_ipv6(skb)) + return NF_DROP; + + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + + nf_bridge_update_protocol(skb); + + data = this_cpu_ptr(&brnf_frag_data_storage); + data->encap_size = nf_bridge_encap_header_len(skb); + data->size = ETH_HLEN + data->encap_size; + + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, + data->size); + + if (v6ops) + return v6ops->fragment(sk, skb, br_nf_push_frag_xmit); + else + return -EMSGSIZE; + } +#endif + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(sk, skb); +} + +/* PF_BRIDGE/POST_ROUTING ********************************************/ +static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *realoutdev = bridge_parent(skb->dev); + u_int8_t pf; + + /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in + * on a bridge, but was delivered locally and is now being routed: + * + * POST_ROUTING was already invoked from the ip stack. + */ + if (!nf_bridge || !nf_bridge->physoutdev) + return NF_ACCEPT; + + if (!realoutdev) + return NF_DROP; + + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + pf = NFPROTO_IPV4; + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + pf = NFPROTO_IPV6; + else + return NF_ACCEPT; + + /* We assume any code from br_dev_queue_push_xmit onwards doesn't care + * about the value of skb->pkt_type. */ + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->pkt_otherhost = true; + } + + nf_bridge_pull_encap_header(skb); + if (pf == NFPROTO_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb, + NULL, realoutdev, + br_nf_dev_queue_xmit); + + return NF_STOLEN; +} + +/* IP/SABOTAGE *****************************************************/ +/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING + * for the second time. */ +static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + if (skb->nf_bridge && + !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { + return NF_STOP; + } + + return NF_ACCEPT; +} + +/* This is called when br_netfilter has called into iptables/netfilter, + * and DNAT has taken place on a bridge-forwarded packet. + * + * neigh->output has created a new MAC header, with local br0 MAC + * as saddr. + * + * This restores the original MAC saddr of the bridged packet + * before invoking bridge forward logic to transmit the packet. + */ +static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + skb_pull(skb, ETH_HLEN); + nf_bridge->mask &= ~BRNF_BRIDGED_DNAT; + + BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); + + skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), + nf_bridge->neigh_header, + ETH_HLEN - ETH_ALEN); + skb->dev = nf_bridge->physindev; + + nf_bridge->physoutdev = NULL; + br_handle_frame_finish(NULL, skb); +} + +static int br_nf_dev_xmit(struct sk_buff *skb) +{ + if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { + br_nf_pre_routing_finish_bridge_slow(skb); + return 1; + } + return 0; +} + +static const struct nf_br_ops br_ops = { + .br_dev_xmit_hook = br_nf_dev_xmit, +}; + +void br_netfilter_enable(void) +{ +} +EXPORT_SYMBOL_GPL(br_netfilter_enable); + +/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because + * br_dev_queue_push_xmit is called afterwards */ +static struct nf_hook_ops br_nf_ops[] __read_mostly = { + { + .hook = br_nf_pre_routing, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_local_in, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_forward_ip, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF - 1, + }, + { + .hook = br_nf_forward_arp, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_post_routing, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_BR_PRI_LAST, + }, + { + .hook = ip_sabotage_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_FIRST, + }, + { + .hook = ip_sabotage_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_FIRST, + }, +}; + +#ifdef CONFIG_SYSCTL +static +int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write && *(int *)(ctl->data)) + *(int *)(ctl->data) = 1; + return ret; +} + +static struct ctl_table brnf_table[] = { + { + .procname = "bridge-nf-call-arptables", + .data = &brnf_call_arptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-call-iptables", + .data = &brnf_call_iptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-call-ip6tables", + .data = &brnf_call_ip6tables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-filter-vlan-tagged", + .data = &brnf_filter_vlan_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-filter-pppoe-tagged", + .data = &brnf_filter_pppoe_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-pass-vlan-input-dev", + .data = &brnf_pass_vlan_indev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { } +}; +#endif + +static int __init br_netfilter_init(void) +{ + int ret; + + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + if (ret < 0) + return ret; + +#ifdef CONFIG_SYSCTL + brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); + if (brnf_sysctl_header == NULL) { + printk(KERN_WARNING + "br_netfilter: can't register to sysctl.\n"); + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return -ENOMEM; + } +#endif + RCU_INIT_POINTER(nf_br_ops, &br_ops); + printk(KERN_NOTICE "Bridge firewalling registered\n"); + return 0; +} + +static void __exit br_netfilter_fini(void) +{ + RCU_INIT_POINTER(nf_br_ops, NULL); + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(brnf_sysctl_header); +#endif +} + +module_init(br_netfilter_init); +module_exit(br_netfilter_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); -- cgit From d2609b345ebf0547015a78588c4d7ad68c9ccf26 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:34 +0200 Subject: Bluetooth: hci_core/mgmt: Introduce multi-adv list The current hci dev structure only supports a single advertising instance. To support multi-instance advertising it is necessary to introduce a linked list of advertising instances so that multiple advertising instances can be dynamically added and/or removed. In a first step, the existing adv_instance member of the hci_dev struct is supplemented by a linked list of advertising instances. This patch introduces the list and supporting list management infrastructure. The list is not being used yet. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++ net/bluetooth/mgmt.c | 2 +- 2 files changed, 118 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 573711c2d09e..ebf37ebcfd12 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2610,6 +2610,119 @@ int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } +/* This function requires the caller holds hdev->lock */ +struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + list_for_each_entry(adv_instance, &hdev->adv_instances, list) { + if (adv_instance->instance == instance) + return adv_instance; + } + + return NULL; +} + +/* This function requires the caller holds hdev->lock */ +struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { + struct adv_info *cur_instance; + + cur_instance = hci_find_adv_instance(hdev, instance); + if (!cur_instance) + return NULL; + + if (cur_instance == list_last_entry(&hdev->adv_instances, + struct adv_info, list)) + return list_first_entry(&hdev->adv_instances, + struct adv_info, list); + else + return list_next_entry(cur_instance, list); +} + +/* This function requires the caller holds hdev->lock */ +int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -ENOENT; + + BT_DBG("%s removing %dMR", hdev->name, instance); + + list_del(&adv_instance->list); + kfree(adv_instance); + + hdev->adv_instance_cnt--; + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +void hci_adv_instances_clear(struct hci_dev *hdev) +{ + struct adv_info *adv_instance, *n; + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + list_del(&adv_instance->list); + kfree(adv_instance); + } + + hdev->adv_instance_cnt = 0; +} + +/* This function requires the caller holds hdev->lock */ +int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, + u16 adv_data_len, u8 *adv_data, + u16 scan_rsp_len, u8 *scan_rsp_data, + u16 timeout, u16 duration) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (adv_instance) { + memset(adv_instance->adv_data, 0, + sizeof(adv_instance->adv_data)); + memset(adv_instance->scan_rsp_data, 0, + sizeof(adv_instance->scan_rsp_data)); + } else { + if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES || + instance < 1 || instance > HCI_MAX_ADV_INSTANCES) + return -EOVERFLOW; + + adv_instance = kmalloc(sizeof(*adv_instance), GFP_KERNEL); + if (!adv_instance) + return -ENOMEM; + + memset(adv_instance, 0, sizeof(*adv_instance)); + adv_instance->instance = instance; + list_add(&adv_instance->list, &hdev->adv_instances); + hdev->adv_instance_cnt++; + } + + adv_instance->flags = flags; + adv_instance->adv_data_len = adv_data_len; + adv_instance->scan_rsp_len = scan_rsp_len; + + if (adv_data_len) + memcpy(adv_instance->adv_data, adv_data, adv_data_len); + + if (scan_rsp_len) + memcpy(adv_instance->scan_rsp_data, + scan_rsp_data, scan_rsp_len); + + adv_instance->timeout = timeout; + + if (duration == 0) + adv_instance->duration = HCI_DEFAULT_ADV_DURATION; + else + adv_instance->duration = duration; + + BT_DBG("%s for %dMR", hdev->name, instance); + + return 0; +} + struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { @@ -3015,6 +3128,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; + hdev->adv_instance_cnt = 0; + hdev->cur_adv_instance = 0x00; hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; @@ -3056,6 +3171,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); + INIT_LIST_HEAD(&hdev->adv_instances); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); @@ -3249,6 +3365,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); + hci_adv_instances_clear(hdev); hci_bdaddr_list_clear(&hdev->le_white_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index e41bbe28a36e..92c50a17fdf9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6813,7 +6813,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, rp->supported_flags = cpu_to_le32(supported_flags); rp->max_adv_data_len = HCI_MAX_AD_LENGTH; rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH; - rp->max_instances = 1; + rp->max_instances = HCI_MAX_ADV_INSTANCES; /* Currently only one instance is supported, so simply return the * current instance number. -- cgit From 5d900e4601391576a3c0644d7fcad1ebf41a516e Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:35 +0200 Subject: Bluetooth: hci_core/mgmt: move adv timeout to hdev Currently the delayed work managing advertising duration and timeout is part of the advertising instance structure. This is not correct as only a single instance can be advertised at any given time. To implement round robin advertising a single delayed work structure is needed. To fix this the delayed work structure is being moved to the hci_dev structure. The instance specific variable is renamed to "remaining_time" to make it clear that this is the remaining lifetime of the instance and not the current advertising timeout. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 29 +++++++++++++++++++++++++++++ net/bluetooth/mgmt.c | 27 +++++++++++---------------- 2 files changed, 40 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ebf37ebcfd12..d1110db3b0d4 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1591,6 +1591,11 @@ static int hci_dev_do_close(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_MGMT)) cancel_delayed_work_sync(&hdev->rpa_expired); + if (hdev->adv_instance_timeout) { + cancel_delayed_work_sync(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ @@ -2147,6 +2152,17 @@ static void hci_discov_off(struct work_struct *work) mgmt_discoverable_timeout(hdev); } +static void hci_adv_timeout_expire(struct work_struct *work) +{ + struct hci_dev *hdev; + + hdev = container_of(work, struct hci_dev, adv_instance_expire.work); + + BT_DBG("%s", hdev->name); + + mgmt_adv_timeout_expired(hdev); +} + void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; @@ -2650,6 +2666,11 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) BT_DBG("%s removing %dMR", hdev->name, instance); + if (hdev->cur_adv_instance == instance && hdev->adv_instance_timeout) { + cancel_delayed_work(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + list_del(&adv_instance->list); kfree(adv_instance); @@ -2663,6 +2684,11 @@ void hci_adv_instances_clear(struct hci_dev *hdev) { struct adv_info *adv_instance, *n; + if (hdev->adv_instance_timeout) { + cancel_delayed_work(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { list_del(&adv_instance->list); kfree(adv_instance); @@ -2712,6 +2738,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, scan_rsp_data, scan_rsp_len); adv_instance->timeout = timeout; + adv_instance->remaining_time = timeout; if (duration == 0) adv_instance->duration = HCI_DEFAULT_ADV_DURATION; @@ -3130,6 +3157,7 @@ struct hci_dev *hci_alloc_dev(void) hdev->adv_tx_power = HCI_TX_POWER_INVALID; hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; + hdev->adv_instance_timeout = 0; hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; @@ -3183,6 +3211,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work); + INIT_DELAYED_WORK(&hdev->adv_instance_expire, hci_adv_timeout_expire); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 92c50a17fdf9..a8319f6cfa65 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1466,8 +1466,8 @@ static void clear_adv_instance(struct hci_dev *hdev) if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) return; - if (hdev->adv_instance.timeout) - cancel_delayed_work(&hdev->adv_instance.timeout_exp); + if (hdev->adv_instance_timeout) + cancel_delayed_work(&hdev->adv_instance_expire); memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); advertising_removed(NULL, hdev, 1); @@ -1497,7 +1497,7 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - if (hdev->adv_instance.timeout) + if (hdev->adv_instance_timeout) clear_adv_instance(hdev); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) @@ -6914,12 +6914,9 @@ unlock: hci_dev_unlock(hdev); } -static void adv_timeout_expired(struct work_struct *work) +void mgmt_adv_timeout_expired(struct hci_dev *hdev) { - struct hci_dev *hdev = container_of(work, struct hci_dev, - adv_instance.timeout_exp.work); - - hdev->adv_instance.timeout = 0; + hdev->adv_instance_timeout = 0; hci_dev_lock(hdev); clear_adv_instance(hdev); @@ -6981,8 +6978,6 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - INIT_DELAYED_WORK(&hdev->adv_instance.timeout_exp, adv_timeout_expired); - hdev->adv_instance.flags = flags; hdev->adv_instance.adv_data_len = cp->adv_data_len; hdev->adv_instance.scan_rsp_len = cp->scan_rsp_len; @@ -6994,14 +6989,14 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, memcpy(hdev->adv_instance.scan_rsp_data, cp->data + cp->adv_data_len, cp->scan_rsp_len); - if (hdev->adv_instance.timeout) - cancel_delayed_work(&hdev->adv_instance.timeout_exp); + if (hdev->adv_instance_timeout) + cancel_delayed_work(&hdev->adv_instance_expire); - hdev->adv_instance.timeout = timeout; + hdev->adv_instance_timeout = timeout; if (timeout) queue_delayed_work(hdev->workqueue, - &hdev->adv_instance.timeout_exp, + &hdev->adv_instance_expire, msecs_to_jiffies(timeout * 1000)); if (!hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING_INSTANCE)) @@ -7106,8 +7101,8 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (hdev->adv_instance.timeout) - cancel_delayed_work(&hdev->adv_instance.timeout_exp); + if (hdev->adv_instance_timeout) + cancel_delayed_work(&hdev->adv_instance_expire); memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); -- cgit From 91aa9bb2e3d2130b0ef004d2a05cfda6f0222ce2 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:36 +0200 Subject: Bluetooth: mgmt: dry update_scan_rsp_data() update_scan_rsp_data() duplicates code from get_current_adv_instance(). This is being fixed by letting the former make use of the latter. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 50 +++++++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a8319f6cfa65..c3304164cab9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -832,6 +832,20 @@ static struct mgmt_pending_cmd *pending_find_data(u16 opcode, return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, opcode, hdev, data); } +static u8 get_current_adv_instance(struct hci_dev *hdev) +{ + /* The "Set Advertising" setting supersedes the "Add Advertising" + * setting. Here we set the advertising data based on which + * setting was set. When neither apply, default to the global settings, + * represented by instance "0". + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + !hci_dev_test_flag(hdev, HCI_ADVERTISING)) + return 0x01; + + return 0x00; +} + static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) { u8 ad_len = 0; @@ -900,21 +914,8 @@ static void update_scan_rsp_data_for_instance(struct hci_request *req, static void update_scan_rsp_data(struct hci_request *req) { - struct hci_dev *hdev = req->hdev; - u8 instance; - - /* The "Set Advertising" setting supersedes the "Add Advertising" - * setting. Here we set the scan response data based on which - * setting was set. When neither apply, default to the global settings, - * represented by instance "0". - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && - !hci_dev_test_flag(hdev, HCI_ADVERTISING)) - instance = 0x01; - else - instance = 0x00; - - update_scan_rsp_data_for_instance(req, instance); + update_scan_rsp_data_for_instance(req, + get_current_adv_instance(req->hdev)); } static u8 get_adv_discov_flags(struct hci_dev *hdev) @@ -941,20 +942,6 @@ static u8 get_adv_discov_flags(struct hci_dev *hdev) return 0; } -static u8 get_current_adv_instance(struct hci_dev *hdev) -{ - /* The "Set Advertising" setting supersedes the "Add Advertising" - * setting. Here we set the advertising data based on which - * setting was set. When neither apply, default to the global settings, - * represented by instance "0". - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && - !hci_dev_test_flag(hdev, HCI_ADVERTISING)) - return 0x01; - - return 0x00; -} - static bool get_connectable(struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd; @@ -1093,10 +1080,7 @@ static void update_adv_data_for_instance(struct hci_request *req, u8 instance) static void update_adv_data(struct hci_request *req) { - struct hci_dev *hdev = req->hdev; - u8 instance = get_current_adv_instance(hdev); - - update_adv_data_for_instance(req, instance); + update_adv_data_for_instance(req, get_current_adv_instance(req->hdev)); } int mgmt_update_adv_data(struct hci_dev *hdev) -- cgit From efae002c2c8acd1fd2d0c6e47122944094c8ae1f Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:37 +0200 Subject: Bluetooth: mgmt: rename update_*_data_for_instance() The ...for_instance function name is quite long and does not follow the ..._inst_... convention followed elsewhere in the code. This patch renames the ...for_instance functions to their shorter ..._inst_... version. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c3304164cab9..7fabcb60da2d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -883,8 +883,7 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) return hdev->adv_instance.scan_rsp_len; } -static void update_scan_rsp_data_for_instance(struct hci_request *req, - u8 instance) +static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_scan_rsp_data cp; @@ -914,8 +913,7 @@ static void update_scan_rsp_data_for_instance(struct hci_request *req, static void update_scan_rsp_data(struct hci_request *req) { - update_scan_rsp_data_for_instance(req, - get_current_adv_instance(req->hdev)); + update_inst_scan_rsp_data(req, get_current_adv_instance(req->hdev)); } static u8 get_adv_discov_flags(struct hci_dev *hdev) @@ -1052,7 +1050,7 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) return ad_len; } -static void update_adv_data_for_instance(struct hci_request *req, u8 instance) +static void update_inst_adv_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_adv_data cp; @@ -1080,7 +1078,7 @@ static void update_adv_data_for_instance(struct hci_request *req, u8 instance) static void update_adv_data(struct hci_request *req) { - update_adv_data_for_instance(req, get_current_adv_instance(req->hdev)); + update_inst_adv_data(req, get_current_adv_instance(req->hdev)); } int mgmt_update_adv_data(struct hci_dev *hdev) @@ -4776,8 +4774,8 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, if (val) { /* Switch to instance "0" for the Set Advertising setting. */ - update_adv_data_for_instance(&req, 0); - update_scan_rsp_data_for_instance(&req, 0); + update_inst_adv_data(&req, 0x00); + update_inst_scan_rsp_data(&req, 0x00); enable_advertising(&req); } else { disable_advertising(&req); -- cgit From 286e0c83df948eef9f51d50ed68ec8a1bb5051e4 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:38 +0200 Subject: Bluetooth: mgmt: multi adv for read_adv_features() The read_adv_features() method had a single instance identifier hard coded. Refer to the advertising instance list instead to return a dynamically generated list of instance identifiers. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7fabcb60da2d..49356c7788f9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6763,8 +6763,9 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_read_adv_features *rp; size_t rp_len; - int err; + int err, i; bool instance; + struct adv_info *adv_instance; u32 supported_flags; BT_DBG("%s", hdev->name); @@ -6777,12 +6778,9 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, rp_len = sizeof(*rp); - /* Currently only one instance is supported, so just add 1 to the - * response length. - */ instance = hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE); if (instance) - rp_len++; + rp_len += hdev->adv_instance_cnt; rp = kmalloc(rp_len, GFP_ATOMIC); if (!rp) { @@ -6797,12 +6795,16 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH; rp->max_instances = HCI_MAX_ADV_INSTANCES; - /* Currently only one instance is supported, so simply return the - * current instance number. - */ if (instance) { - rp->num_instances = 1; - rp->instance[0] = 1; + i = 0; + list_for_each_entry(adv_instance, &hdev->adv_instances, list) { + if (i >= hdev->adv_instance_cnt) + break; + + rp->instance[i] = adv_instance->instance; + i++; + } + rp->num_instances = hdev->adv_instance_cnt; } else { rp->num_instances = 0; } -- cgit From 3ff37e6b8cfcc57412320ec30f2b66d47f271cad Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:39 +0200 Subject: Bluetooth: mgmt: multi adv for get_current_adv_instance() Replaces the hard coded instance identifier in get_current_adv_instance() with the actual current instance identifier so that this method is prepared to work with more than one advertising instance. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 49356c7788f9..55b91530ea26 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -841,7 +841,7 @@ static u8 get_current_adv_instance(struct hci_dev *hdev) */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && !hci_dev_test_flag(hdev, HCI_ADVERTISING)) - return 0x01; + return hdev->cur_adv_instance; return 0x00; } -- cgit From 411b4121e3a38ff78695ae019642a9cab1babf12 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:40 +0200 Subject: Bluetooth: mgmt: multi adv for get_adv_instance_flags() The get_adv_instance_flags() would not work with instance identifiers other than 0x01. This is being fixed so that arbitrary instance identifiers can be dealt with while still correctly dealing with the special case of the 0x00 identifier. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 55b91530ea26..05f14c54b7ed 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -960,12 +960,17 @@ static bool get_connectable(struct hci_dev *hdev) static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) { u32 flags; + struct adv_info *adv_instance; - if (instance > 0x01) - return 0; + if (instance != 0x00) { + adv_instance = hci_find_adv_instance(hdev, instance); - if (instance == 0x01) - return hdev->adv_instance.flags; + /* Return 0 when we got an invalid instance identifier. */ + if (!adv_instance) + return 0; + + return adv_instance->flags; + } /* Instance 0 always manages the "Tx Power" and "Flags" fields */ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; -- cgit From bea28e65994de7fda06eb5f76aef3d25bd9c135f Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:41 +0200 Subject: Bluetooth: mgmt: improve get_adv_instance_flags() readability Switch if and else conditions to replace a negative statement by a positive one which makes the condition more readable. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 05f14c54b7ed..47fa16bffbe2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -962,26 +962,28 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) u32 flags; struct adv_info *adv_instance; - if (instance != 0x00) { - adv_instance = hci_find_adv_instance(hdev, instance); + if (instance == 0x00) { + /* Instance 0 always manages the "Tx Power" and "Flags" + * fields + */ + flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; - /* Return 0 when we got an invalid instance identifier. */ - if (!adv_instance) - return 0; + /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting + * corresponds to the "connectable" instance flag. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) + flags |= MGMT_ADV_FLAG_CONNECTABLE; - return adv_instance->flags; + return flags; } - /* Instance 0 always manages the "Tx Power" and "Flags" fields */ - flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; + adv_instance = hci_find_adv_instance(hdev, instance); - /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting corresponds - * to the "connectable" instance flag. - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) - flags |= MGMT_ADV_FLAG_CONNECTABLE; + /* Return 0 when we got an invalid instance identifier. */ + if (!adv_instance) + return 0; - return flags; + return adv_instance->flags; } static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) -- cgit From 7b683b744ef9492e91dd849e0a7451f55661c9d7 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:42 +0200 Subject: Bluetooth: mgmt: multi adv for enable_advertising() Previously enable_advertising() would rely on get_adv_instance_scan_rsp_len() which checked for a hard coded instance identifier. This is being changed to check for the current advertising instance's scan response length instead. The function is renamed accordingly. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 47fa16bffbe2..76aee8a3acf0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -986,16 +986,23 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) return adv_instance->flags; } -static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) +static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) { - /* Ignore instance 0 and other unsupported instances */ - if (instance != 0x01) + u8 instance = get_current_adv_instance(hdev); + struct adv_info *adv_instance; + + /* Ignore instance 0 */ + if (instance == 0x00) + return 0; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) return 0; /* TODO: Take into account the "appearance" and "local-name" flags here. * These are currently being ignored as they are not supported. */ - return hdev->adv_instance.scan_rsp_len; + return adv_instance->scan_rsp_len; } static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) @@ -1266,7 +1273,7 @@ static void enable_advertising(struct hci_request *req) if (connectable) cp.type = LE_ADV_IND; - else if (get_adv_instance_scan_rsp_len(hdev, instance)) + else if (get_cur_adv_instance_scan_rsp_len(hdev)) cp.type = LE_ADV_SCAN_IND; else cp.type = LE_ADV_NONCONN_IND; -- cgit From ca21fbe97c444c42fee7211dd625f2f4d373fad8 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:43 +0200 Subject: Bluetooth: mgmt: multi adv for create_instance_scan_rsp_data() The create_instance_scan_rsp_data() function could not deal with multiple advertising instances previously. This is being fixed by adding an additional instance parameter. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 76aee8a3acf0..fc807dcc2533 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -872,15 +872,22 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) return ad_len; } -static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, + u8 *ptr) { + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + /* TODO: Set the appropriate entries based on advertising instance flags * here once flags other than 0 are supported. */ - memcpy(ptr, hdev->adv_instance.scan_rsp_data, - hdev->adv_instance.scan_rsp_len); + memcpy(ptr, adv_instance->scan_rsp_data, + adv_instance->scan_rsp_len); - return hdev->adv_instance.scan_rsp_len; + return adv_instance->scan_rsp_len; } static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance) @@ -895,7 +902,7 @@ static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); if (instance) - len = create_instance_scan_rsp_data(hdev, cp.data); + len = create_instance_scan_rsp_data(hdev, instance, cp.data); else len = create_default_scan_rsp_data(hdev, cp.data); -- cgit From f63ba24b97ac795c516315c2b1b8a8463a6acd46 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:44 +0200 Subject: Bluetooth: mgmt: multi adv for create_instance_adv_data() The create_instance_adv_data() function could not deal with multiple advertising instances previously. This is being fixed by retrieving advertising instances from the newly introduced dynamic advertising instance list. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index fc807dcc2533..04efc56d1641 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1014,8 +1014,18 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) { + struct adv_info *adv_instance = NULL; u8 ad_len = 0, flags = 0; - u32 instance_flags = get_adv_instance_flags(hdev, instance); + u32 instance_flags; + + /* Return 0 when the current instance identifier is invalid. */ + if (instance) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + } + + instance_flags = get_adv_instance_flags(hdev, instance); /* The Add Advertising command allows userspace to set both the general * and limited discoverable flags. @@ -1049,12 +1059,11 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) } } - if (instance) { - memcpy(ptr, hdev->adv_instance.adv_data, - hdev->adv_instance.adv_data_len); - - ad_len += hdev->adv_instance.adv_data_len; - ptr += hdev->adv_instance.adv_data_len; + if (adv_instance) { + memcpy(ptr, adv_instance->adv_data, + adv_instance->adv_data_len); + ad_len += adv_instance->adv_data_len; + ptr += adv_instance->adv_data_len; } /* Provide Tx Power only if we can provide a valid value for it */ -- cgit From 7816b82039b56308a0d685e97d4a9f4b52e239bd Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:45 +0200 Subject: Bluetooth: mgmt: multi adv for set_advertising*() The set_advertising() and set_advertising_complete() methods rely on the now obsolete hci_dev->adv_instance structure. We replace this reference by an equivalent access to the newly introduced dynamic advertising instance list. This patch introduces a helper function that schedules an advertising instance correctly calculating advertising timing based on the timeout and duration settings of the instance. Scheduling is factored into its own function for readability and code sharing. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 04efc56d1641..55765dd79070 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1471,6 +1471,73 @@ static void advertising_removed(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk); } +static int schedule_adv_instance(struct hci_request *req, u8 instance, + bool force) { + struct hci_dev *hdev = req->hdev; + struct adv_info *adv_instance = NULL; + u16 timeout; + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) + return -EPERM; + + if (hdev->adv_instance_timeout) + return -EBUSY; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -ENOENT; + + /* A zero timeout means unlimited advertising. As long as there is + * only one instance, duration should be ignored. We still set a timeout + * in case further instances are being added later on. + * + * If the remaining lifetime of the instance is more than the duration + * then the timeout corresponds to the duration, otherwise it will be + * reduced to the remaining instance lifetime. + */ + if (adv_instance->timeout == 0 || + adv_instance->duration <= adv_instance->remaining_time) + timeout = adv_instance->duration; + else + timeout = adv_instance->remaining_time; + + /* The remaining time is being reduced unless the instance is being + * advertised without time limit. + */ + if (adv_instance->timeout) + adv_instance->remaining_time = + adv_instance->remaining_time - timeout; + + hdev->adv_instance_timeout = timeout; + queue_delayed_work(hdev->workqueue, + &hdev->adv_instance_expire, + msecs_to_jiffies(timeout * 1000)); + + /* If we're just re-scheduling the same instance again then do not + * execute any HCI commands. This happens when a single instance is + * being advertised. + */ + if (!force && hdev->cur_adv_instance == instance && + hci_dev_test_flag(hdev, HCI_LE_ADV)) + return 0; + + hdev->cur_adv_instance = instance; + update_adv_data(req); + update_scan_rsp_data(req); + enable_advertising(req); + + return 0; +} + +static void cancel_adv_timeout(struct hci_dev *hdev) +{ + if (hdev->adv_instance_timeout) { + hdev->adv_instance_timeout = 0; + cancel_delayed_work(&hdev->adv_instance_expire); + } +} + static void clear_adv_instance(struct hci_dev *hdev) { struct hci_request req; @@ -4681,6 +4748,9 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, { struct cmd_lookup match = { NULL, hdev }; struct hci_request req; + u8 instance; + struct adv_info *adv_instance; + int err; hci_dev_lock(hdev); @@ -4706,18 +4776,31 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, sock_put(match.sk); /* If "Set Advertising" was just disabled and instance advertising was - * set up earlier, then enable the advertising instance. + * set up earlier, then re-enable multi-instance advertising. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) || + list_empty(&hdev->adv_instances)) goto unlock; + instance = hdev->cur_adv_instance; + if (!instance) { + adv_instance = list_first_entry_or_null(&hdev->adv_instances, + struct adv_info, list); + if (!adv_instance) + goto unlock; + + instance = adv_instance->instance; + } + hci_req_init(&req, hdev); - update_adv_data(&req); - enable_advertising(&req); + err = schedule_adv_instance(&req, instance, true); + + if (!err) + err = hci_req_run(&req, enable_advertising_instance); - if (hci_req_run(&req, enable_advertising_instance) < 0) + if (err) BT_ERR("Failed to re-configure advertising"); unlock: @@ -4802,8 +4885,13 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, else hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); + cancel_adv_timeout(hdev); + if (val) { - /* Switch to instance "0" for the Set Advertising setting. */ + /* Switch to instance "0" for the Set Advertising setting. + * We cannot use update_[adv|scan_rsp]_data() here as the + * HCI_ADVERTISING flag is not yet set. + */ update_inst_adv_data(&req, 0x00); update_inst_scan_rsp_data(&req, 0x00); enable_advertising(&req); -- cgit From 847818d9c05f8951270600c0d3260871dbc23134 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:46 +0200 Subject: Bluetooth: mgmt: multi adv for clear_adv_instances() The clear_adv_instance() function could not clean up multiple advertising instances previously. It is being changed to provide both, a means to clean up a single instance and cleaning up all instances at once. An additional instance parameter is being introduced to achieve this. Passing in 0x00 to this parameter signifies that all instances should be cleaned up. This semantics has been chosen similarly to the semantics of the instance parameter in the remove_advertising() function. When removing a single instance the method also ensures that another instance will be scheduled if available. When the currently advertising method is being removed, it will be canceled immediately. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 97 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 81 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 55765dd79070..ac5fc357c757 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1538,27 +1538,74 @@ static void cancel_adv_timeout(struct hci_dev *hdev) } } -static void clear_adv_instance(struct hci_dev *hdev) +/* For a single instance: + * - force == true: The instance will be removed even when its remaining + * lifetime is not zero. + * - force == false: the instance will be deactivated but kept stored unless + * the remaining lifetime is zero. + * + * For instance == 0x00: + * - force == true: All instances will be removed regardless of their timeout + * setting. + * - force == false: Only instances that have a timeout will be removed. + */ +static void clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, + u8 instance, bool force) { - struct hci_request req; + struct adv_info *adv_instance, *n, *next_instance = NULL; + int err; + u8 rem_inst; - if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) - return; + /* Cancel any timeout concerning the removed instance(s). */ + if (!instance || hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); - if (hdev->adv_instance_timeout) - cancel_delayed_work(&hdev->adv_instance_expire); + /* Get the next instance to advertise BEFORE we remove + * the current one. This can be the same instance again + * if there is only one instance. + */ + if (instance && hdev->cur_adv_instance == instance) + next_instance = hci_get_next_instance(hdev, instance); - memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); - advertising_removed(NULL, hdev, 1); - hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + if (instance == 0x00) { + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, + list) { + if (!(force || adv_instance->timeout)) + continue; - if (!hdev_is_powered(hdev) || + rem_inst = adv_instance->instance; + err = hci_remove_adv_instance(hdev, rem_inst); + if (!err) + advertising_removed(NULL, hdev, rem_inst); + } + hdev->cur_adv_instance = 0x00; + } else { + adv_instance = hci_find_adv_instance(hdev, instance); + + if (force || (adv_instance && adv_instance->timeout && + !adv_instance->remaining_time)) { + /* Don't advertise a removed instance. */ + if (next_instance && + next_instance->instance == instance) + next_instance = NULL; + + err = hci_remove_adv_instance(hdev, instance); + if (!err) + advertising_removed(NULL, hdev, instance); + } + } + + if (list_empty(&hdev->adv_instances)) { + hdev->cur_adv_instance = 0x00; + hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + } + + if (!req || !hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return; - hci_req_init(&req, hdev); - disable_advertising(&req); - hci_req_run(&req, NULL); + if (next_instance) + schedule_adv_instance(req, next_instance->instance, false); } static int clean_up_hci_state(struct hci_dev *hdev) @@ -1576,8 +1623,7 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - if (hdev->adv_instance_timeout) - clear_adv_instance(hdev); + clear_adv_instance(hdev, NULL, 0x00, false); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) disable_advertising(&req); @@ -2532,6 +2578,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) val = !!cp->val; enabled = lmp_host_le_capable(hdev); + if (!val) + clear_adv_instance(hdev, NULL, 0x00, true); + if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; @@ -7018,10 +7067,26 @@ unlock: void mgmt_adv_timeout_expired(struct hci_dev *hdev) { + u8 instance; + struct hci_request req; + hdev->adv_instance_timeout = 0; + instance = get_current_adv_instance(hdev); + if (instance == 0x00) + return; + hci_dev_lock(hdev); - clear_adv_instance(hdev); + hci_req_init(&req, hdev); + + clear_adv_instance(hdev, &req, instance, false); + + if (list_empty(&hdev->adv_instances)) + disable_advertising(&req); + + if (!skb_queue_empty(&req.cmd_q)) + hci_req_run(&req, NULL); + hci_dev_unlock(hdev); } -- cgit From fffd38bca51c9a1c00508b754ab66edb6f39cf37 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:47 +0200 Subject: Bluetooth: mgmt/hci_core: multi-adv for add_advertising*() The add_advertising() and add_advertising_complete() functions reference the now obsolete hdev->adv_instance struct. Both methods are being refactored to access the dynamic advertising instance list instead. This patch also introduces all logic necessary to actually deal with multiple instance advertising. Notably the mgmt_adv_inst_expired() and schedule_adv_inst() method are being referenced to schedule instances in a round robin fashion. This patch also introduces a "pending" flag into the adv_info struct. This is necessary to identify and remove recently added advertising instances when the HCI commands return with an error status code. Otherwise new advertising instances could be leaked without properly informing userspace about their existence. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 1 + net/bluetooth/mgmt.c | 108 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 74 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index d1110db3b0d4..e50f7c3c67f8 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2721,6 +2721,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, return -ENOMEM; memset(adv_instance, 0, sizeof(*adv_instance)); + adv_instance->pending = true; adv_instance->instance = instance; list_add(&adv_instance->list, &hdev->adv_instances); hdev->adv_instance_cnt++; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index ac5fc357c757..0cc685495510 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7033,7 +7033,10 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_pending_cmd *cmd; + struct mgmt_cp_add_advertising *cp; struct mgmt_rp_add_advertising rp; + struct adv_info *adv_instance, *n; + u8 instance; BT_DBG("status %d", status); @@ -7041,16 +7044,32 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev); - if (status) { + if (status) hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); - memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); - advertising_removed(cmd ? cmd->sk : NULL, hdev, 1); + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + if (!adv_instance->pending) + continue; + + if (!status) { + adv_instance->pending = false; + continue; + } + + instance = adv_instance->instance; + + if (hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); + + hci_remove_adv_instance(hdev, instance); + advertising_removed(cmd ? cmd->sk : NULL, hdev, instance); } if (!cmd) goto unlock; - rp.instance = 0x01; + cp = cmd->param; + rp.instance = cp->instance; if (status) mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, @@ -7098,7 +7117,10 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, u32 flags; u32 supported_flags; u8 status; - u16 timeout; + u16 timeout, duration; + unsigned int prev_instance_cnt = hdev->adv_instance_cnt; + u8 schedule_instance = 0; + struct adv_info *next_instance; int err; struct mgmt_pending_cmd *cmd; struct hci_request req; @@ -7112,12 +7134,13 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, flags = __le32_to_cpu(cp->flags); timeout = __le16_to_cpu(cp->timeout); + duration = __le16_to_cpu(cp->duration); - /* The current implementation only supports adding one instance and only - * a subset of the specified flags. + /* The current implementation only supports a subset of the specified + * flags. */ supported_flags = get_supported_adv_flags(hdev); - if (cp->instance != 0x01 || (flags & ~supported_flags)) + if (flags & ~supported_flags) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -7145,36 +7168,51 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - hdev->adv_instance.flags = flags; - hdev->adv_instance.adv_data_len = cp->adv_data_len; - hdev->adv_instance.scan_rsp_len = cp->scan_rsp_len; - - if (cp->adv_data_len) - memcpy(hdev->adv_instance.adv_data, cp->data, cp->adv_data_len); - - if (cp->scan_rsp_len) - memcpy(hdev->adv_instance.scan_rsp_data, - cp->data + cp->adv_data_len, cp->scan_rsp_len); + err = hci_add_adv_instance(hdev, cp->instance, flags, + cp->adv_data_len, cp->data, + cp->scan_rsp_len, + cp->data + cp->adv_data_len, + timeout, duration); + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_FAILED); + goto unlock; + } - if (hdev->adv_instance_timeout) - cancel_delayed_work(&hdev->adv_instance_expire); + /* Only trigger an advertising added event if a new instance was + * actually added. + */ + if (hdev->adv_instance_cnt > prev_instance_cnt) + advertising_added(sk, hdev, cp->instance); - hdev->adv_instance_timeout = timeout; + hci_dev_set_flag(hdev, HCI_ADVERTISING_INSTANCE); - if (timeout) - queue_delayed_work(hdev->workqueue, - &hdev->adv_instance_expire, - msecs_to_jiffies(timeout * 1000)); + if (hdev->cur_adv_instance == cp->instance) { + /* If the currently advertised instance is being changed then + * cancel the current advertising and schedule the next + * instance. If there is only one instance then the overridden + * advertising data will be visible right away. + */ + cancel_adv_timeout(hdev); - if (!hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING_INSTANCE)) - advertising_added(sk, hdev, 1); + next_instance = hci_get_next_instance(hdev, cp->instance); + if (next_instance) + schedule_instance = next_instance->instance; + } else if (!hdev->adv_instance_timeout) { + /* Immediately advertise the new instance if no other + * instance is currently being advertised. + */ + schedule_instance = cp->instance; + } - /* If the HCI_ADVERTISING flag is set or the device isn't powered then - * we have no HCI communication to make. Simply return. + /* If the HCI_ADVERTISING flag is set or the device isn't powered or + * there is no instance to be advertised then we have no HCI + * communication to make. Simply return. */ if (!hdev_is_powered(hdev) || - hci_dev_test_flag(hdev, HCI_ADVERTISING)) { - rp.instance = 0x01; + hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !schedule_instance) { + rp.instance = cp->instance; err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); goto unlock; @@ -7192,11 +7230,11 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - update_adv_data(&req); - update_scan_rsp_data(&req); - enable_advertising(&req); + err = schedule_adv_instance(&req, schedule_instance, true); + + if (!err) + err = hci_req_run(&req, add_advertising_complete); - err = hci_req_run(&req, add_advertising_complete); if (err < 0) mgmt_pending_remove(cmd); -- cgit From 01948331af001cd893c8733a4288e9ad246f62f3 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:48 +0200 Subject: Bluetooth: mgmt: multi adv for remove_advertising*() The remove_advertising() and remove_advertising_complete() functions had instance identifiers hard coded. Notably, when passing in 0x00 as an instance identifier to signal that all instances should be removed then the mgmt API would return a hard coded 0x01 rather than returning the expected value 0x00. This bug is being fixed by always referencing the instance identifier from the management API call instead. remove_advertising() is refactored to use the new dynamic advertising instance list. The logic is being changed to make multi-instance advertising actually work, notably the schedule_adv_instance() method is being referenced to make sure that other instances will continue to advertise even if one instance is being removed. The code is made more readable by factoring advertising instance management and initialization into the low-level hci_remove_adv_instance() and hci_adv_instances_clear() functions. The method now references the clear_adv_instance() helper method to remove duplicate logic and code. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0cc685495510..c8ed16d8d999 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7248,6 +7248,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_pending_cmd *cmd; + struct mgmt_cp_remove_advertising *cp; struct mgmt_rp_remove_advertising rp; BT_DBG("status %d", status); @@ -7262,7 +7263,8 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, if (!cmd) goto unlock; - rp.instance = 1; + cp = cmd->param; + rp.instance = cp->instance; mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); @@ -7277,21 +7279,25 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_remove_advertising *cp = data; struct mgmt_rp_remove_advertising rp; + struct adv_info *adv_instance; int err; struct mgmt_pending_cmd *cmd; struct hci_request req; BT_DBG("%s", hdev->name); - /* The current implementation only allows modifying instance no 1. A - * value of 0 indicates that all instances should be cleared. - */ - if (cp->instance > 1) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, - MGMT_STATUS_INVALID_PARAMS); - hci_dev_lock(hdev); + if (cp->instance) + adv_instance = hci_find_adv_instance(hdev, cp->instance); + + if (!(cp->instance == 0x00 || adv_instance)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_REMOVE_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + goto unlock; + } + if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || pending_find(MGMT_OP_SET_LE, hdev)) { @@ -7306,21 +7312,21 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (hdev->adv_instance_timeout) - cancel_delayed_work(&hdev->adv_instance_expire); - - memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); + hci_req_init(&req, hdev); - advertising_removed(sk, hdev, 1); + clear_adv_instance(hdev, &req, cp->instance, true); - hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + if (list_empty(&hdev->adv_instances)) + disable_advertising(&req); - /* If the HCI_ADVERTISING flag is set or the device isn't powered then - * we have no HCI communication to make. Simply return. + /* If no HCI commands have been collected so far or the HCI_ADVERTISING + * flag is set or the device isn't powered then we have no HCI + * communication to make. Simply return. */ - if (!hdev_is_powered(hdev) || + if (skb_queue_empty(&req.cmd_q) || + !hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) { - rp.instance = 1; + rp.instance = cp->instance; err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); @@ -7334,9 +7340,6 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - hci_req_init(&req, hdev); - disable_advertising(&req); - err = hci_req_run(&req, remove_advertising_complete); if (err < 0) mgmt_pending_remove(cmd); -- cgit From 320b3bf7027b89821030e5e7c5fbd1f6aecb4af0 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:49 +0200 Subject: Bluetooth: mgmt: program multi-adv on power on Advertising instances programmed while powered off should be advertised once the device is powered. This patch ensures that all combinations of setting and/or adding advertising configuration while powered off will be correctly activated on power on. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c8ed16d8d999..57e27b7ae3db 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7564,6 +7564,7 @@ static void powered_complete(struct hci_dev *hdev, u8 status, u16 opcode) static int powered_update_hci(struct hci_dev *hdev) { struct hci_request req; + struct adv_info *adv_instance; u8 link_sec; hci_req_init(&req, hdev); @@ -7603,14 +7604,27 @@ static int powered_update_hci(struct hci_dev *hdev) * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ - if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) && + (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))) { update_adv_data(&req); update_scan_rsp_data(&req); } - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + hdev->cur_adv_instance == 0x00 && + !list_empty(&hdev->adv_instances)) { + adv_instance = list_first_entry(&hdev->adv_instances, + struct adv_info, list); + hdev->cur_adv_instance = adv_instance->instance; + } + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) enable_advertising(&req); + else if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + hdev->cur_adv_instance) + schedule_adv_instance(&req, hdev->cur_adv_instance, + true); restart_le_actions(&req); } -- cgit From 9d5fc2f23afc842cbb3ee85600cd92bf5120a71d Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:50 +0200 Subject: Bluetooth: mgmt: multi-adv for trigger_le_scan() This patch ensures that instance advertising is correctly canceled before starting a le scan. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 57e27b7ae3db..f44c6e3be938 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4215,6 +4215,7 @@ static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status) return false; } + cancel_adv_timeout(hdev); disable_advertising(req); } -- cgit From eb6f95f9c38de4cf64600c1ad29fc8e02224b155 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:51 +0200 Subject: Bluetooth: mgmt: multi-adv for mgmt_reenable_advertising() During service discovery, advertising will be disabled. This patch ensures that it is correctly being re-enabled, both for configuration made via set advertising and add advertising, once the scanning times out. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f44c6e3be938..b132032f5710 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -8610,13 +8610,24 @@ static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) void mgmt_reenable_advertising(struct hci_dev *hdev) { struct hci_request req; + u8 instance; if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) return; + instance = get_current_adv_instance(hdev); + hci_req_init(&req, hdev); - enable_advertising(&req); + + if (instance) { + schedule_adv_instance(&req, instance, true); + } else { + update_adv_data(&req); + update_scan_rsp_data(&req); + enable_advertising(&req); + } + hci_req_run(&req, adv_enable_complete); } -- cgit From d4c5af8f71c8104504a83f7c71911550ebe43ac3 Mon Sep 17 00:00:00 2001 From: Florian Grandel Date: Thu, 18 Jun 2015 03:16:52 +0200 Subject: Bluetooth: hci_core: remove obsolete adv_instance Now that the obsolete adv_instance is no longer being referenced anywhere in the code it can be removed without breaking the build. Signed-off-by: Florian Grandel Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e50f7c3c67f8..86ed44e39649 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3224,7 +3224,6 @@ struct hci_dev *hci_alloc_dev(void) hci_init_sysfs(hdev); discovery_init(hdev); - adv_info_init(hdev); return hdev; } -- cgit From e58627d1ec840ee2d8aca5aeae9899319b58e8f0 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 18 Jun 2015 18:58:03 +0200 Subject: Bluetooth: Increment management interface revision This patch increments the management interface revision due to introduction of new multi-advertising feature and various bug fixes. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b132032f5710..2fe6f3bfc579 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 9 +#define MGMT_REVISION 10 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit From 39ecfad68f14b71b6d1917c27eb01c4936f88bda Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 18 Jun 2015 20:50:08 +0300 Subject: Bluetooth: Use zalloc when possible Use zallog for adv_instance allocation instead of kmalloc + memset. This also fixes the following coccinelle warning: >> net/bluetooth/hci_core.c:2693:17-24: WARNING: kzalloc should be used for adv_instance, instead of kmalloc/memset Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 86ed44e39649..2f8fb33067e1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2716,11 +2716,10 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, instance < 1 || instance > HCI_MAX_ADV_INSTANCES) return -EOVERFLOW; - adv_instance = kmalloc(sizeof(*adv_instance), GFP_KERNEL); + adv_instance = kzalloc(sizeof(*adv_instance), GFP_KERNEL); if (!adv_instance) return -ENOMEM; - memset(adv_instance, 0, sizeof(*adv_instance)); adv_instance->pending = true; adv_instance->instance = instance; list_add(&adv_instance->list, &hdev->adv_instances); -- cgit From 952497b159468477392f9b562b904da9bc76d468 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 18 Jun 2015 21:05:31 +0300 Subject: Bluetooth: Fix warning of potentially uninitialized adv_instance variable Rework the logic of checking for a valid adv_instance for non-zero cp->instance values. Without this change we may get (false positive) warnings as follows: >> net/bluetooth/mgmt.c:7294:29: warning: 'adv_instance' may be used uninitialized in this function [-Wuninitialized] Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 2fe6f3bfc579..7998fb279165 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7280,19 +7280,15 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_remove_advertising *cp = data; struct mgmt_rp_remove_advertising rp; - struct adv_info *adv_instance; - int err; struct mgmt_pending_cmd *cmd; struct hci_request req; + int err; BT_DBG("%s", hdev->name); hci_dev_lock(hdev); - if (cp->instance) - adv_instance = hci_find_adv_instance(hdev, cp->instance); - - if (!(cp->instance == 0x00 || adv_instance)) { + if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); -- cgit From 230ac490f7fba2aea52914c69d14b15dd515e49c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 16 Jun 2015 14:07:03 +0200 Subject: netfilter: bridge: split ipv6 code into separated file Resolve compilation breakage when CONFIG_IPV6 is not set by moving the IPv6 code into a separated br_netfilter_ipv6.c file. Fixes: efb6de9b4ba0 ("netfilter: bridge: forward IPv6 fragmented packets") Reported-by: kbuild test robot Signed-off-by: Pablo Neira Ayuso --- net/bridge/Makefile | 1 + net/bridge/br_netfilter_hooks.c | 248 ++-------------------------------------- net/bridge/br_netfilter_ipv6.c | 245 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 255 insertions(+), 239 deletions(-) create mode 100644 net/bridge/br_netfilter_ipv6.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index c52577ac718e..a1cda5d4718d 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -13,6 +13,7 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o br_netfilter-y := br_netfilter_hooks.o +br_netfilter-$(subst m,y,$(CONFIG_IPV6)) += br_netfilter_ipv6.o obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index e4e5f2f29173..d89f4fac0bc5 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -123,11 +123,6 @@ struct brnf_frag_data { static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); #endif -static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb) -{ - return skb->nf_bridge; -} - static void nf_bridge_info_free(struct sk_buff *skb) { if (skb->nf_bridge) { @@ -136,14 +131,6 @@ static void nf_bridge_info_free(struct sk_buff *skb) } } -static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) -{ - struct net_bridge_port *port; - - port = br_port_get_rcu(dev); - return port ? &port->br->fake_rtable : NULL; -} - static inline struct net_device *bridge_parent(const struct net_device *dev) { struct net_bridge_port *port; @@ -152,15 +139,6 @@ static inline struct net_device *bridge_parent(const struct net_device *dev) return port ? port->br->dev : NULL; } -static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) -{ - skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); - if (likely(skb->nf_bridge)) - atomic_set(&(skb->nf_bridge->use), 1); - - return skb->nf_bridge; -} - static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = skb->nf_bridge; @@ -178,7 +156,7 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) return nf_bridge; } -static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) +unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { case __cpu_to_be16(ETH_P_8021Q): @@ -190,14 +168,6 @@ static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) } } -static inline void nf_bridge_push_encap_header(struct sk_buff *skb) -{ - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_push(skb, len); - skb->network_header -= len; -} - static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) { unsigned int len = nf_bridge_encap_header_len(skb); @@ -267,112 +237,7 @@ drop: return -1; } -/* We only check the length. A bridge shouldn't do any hop-by-hop stuff - * anyway - */ -static int check_hbh_len(struct sk_buff *skb) -{ - unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); - u32 pkt_len; - const unsigned char *nh = skb_network_header(skb); - int off = raw - nh; - int len = (raw[1] + 1) << 3; - - if ((raw + len) - skb->data > skb_headlen(skb)) - goto bad; - - off += 2; - len -= 2; - - while (len > 0) { - int optlen = nh[off + 1] + 2; - - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; - break; - - case IPV6_TLV_PADN: - break; - - case IPV6_TLV_JUMBO: - if (nh[off + 1] != 4 || (off & 3) != 2) - goto bad; - pkt_len = ntohl(*(__be32 *)(nh + off + 2)); - if (pkt_len <= IPV6_MAXPLEN || - ipv6_hdr(skb)->payload_len) - goto bad; - if (pkt_len > skb->len - sizeof(struct ipv6hdr)) - goto bad; - if (pskb_trim_rcsum(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - nh = skb_network_header(skb); - break; - default: - if (optlen > len) - goto bad; - break; - } - off += optlen; - len -= optlen; - } - if (len == 0) - return 0; -bad: - return -1; -} - -/* Equivalent to br_validate_ipv4 for IPv6 */ -static int br_validate_ipv6(struct sk_buff *skb) -{ - const struct ipv6hdr *hdr; - struct net_device *dev = skb->dev; - struct inet6_dev *idev = in6_dev_get(skb->dev); - u32 pkt_len; - u8 ip6h_len = sizeof(struct ipv6hdr); - - if (!pskb_may_pull(skb, ip6h_len)) - goto inhdr_error; - - if (skb->len < ip6h_len) - goto drop; - - hdr = ipv6_hdr(skb); - - if (hdr->version != 6) - goto inhdr_error; - - pkt_len = ntohs(hdr->payload_len); - - if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { - if (pkt_len + ip6h_len > skb->len) { - IP6_INC_STATS_BH(dev_net(dev), idev, - IPSTATS_MIB_INTRUNCATEDPKTS); - goto drop; - } - if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { - IP6_INC_STATS_BH(dev_net(dev), idev, - IPSTATS_MIB_INDISCARDS); - goto drop; - } - } - if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) - goto drop; - - memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - /* No IP options in IPv6 header; however it should be - * checked if some next headers need special treatment - */ - return 0; - -inhdr_error: - IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); -drop: - return -1; -} - -static void nf_bridge_update_protocol(struct sk_buff *skb) +void nf_bridge_update_protocol(struct sk_buff *skb) { switch (skb->nf_bridge->orig_proto) { case BRNF_PROTO_8021Q: @@ -391,7 +256,7 @@ static void nf_bridge_update_protocol(struct sk_buff *skb) * don't, we use the neighbour framework to find out. In both cases, we make * sure that br_handle_frame_finish() is called afterwards. */ -static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) +int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) { struct neighbour *neigh; struct dst_entry *dst; @@ -431,77 +296,11 @@ free_skb: return 0; } -static bool daddr_was_changed(const struct sk_buff *skb, - const struct nf_bridge_info *nf_bridge) +static inline bool +br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) { - switch (skb->protocol) { - case htons(ETH_P_IP): - return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; - case htons(ETH_P_IPV6): - return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, - sizeof(ipv6_hdr(skb)->daddr)) != 0; - default: - return false; - } -} - -/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables - * PREROUTING and continue the bridge PRE_ROUTING hook. See comment - * for br_nf_pre_routing_finish(), same logic is used here but - * equivalent IPv6 function ip6_route_input() called indirectly. - */ -static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct rtable *rt; - struct net_device *dev = skb->dev; - const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); - - nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; - - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; - if (daddr_was_changed(skb, nf_bridge)) { - skb_dst_drop(skb); - v6ops->route_input(skb); - - if (skb_dst(skb)->error) { - kfree_skb(skb); - return 0; - } - - if (skb_dst(skb)->dev == dev) { - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); - return 0; - } - ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); - skb->pkt_type = PACKET_HOST; - } else { - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; - } - skb_dst_set_noref(skb, &rt->dst); - } - - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - - return 0; + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } /* This requires some explaining. If DNAT has taken place, @@ -558,7 +357,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) nf_bridge->pkt_otherhost = false; } nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; - if (daddr_was_changed(skb, nf_bridge)) { + if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -636,7 +435,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct } /* Some common code for IPv4/IPv6 */ -static struct net_device *setup_pre_routing(struct sk_buff *skb) +struct net_device *setup_pre_routing(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); @@ -659,35 +458,6 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb) return skb->dev; } -/* Replicate the checks that IPv6 does on packet reception and pass the packet - * to ip6tables. - */ -static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge; - - if (br_validate_ipv6(skb)) - return NF_DROP; - - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) - return NF_DROP; - if (!setup_pre_routing(skb)) - return NF_DROP; - - nf_bridge = nf_bridge_info_get(skb); - nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; - - skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, - skb->dev, NULL, - br_nf_pre_routing_finish_ipv6); - - return NF_STOLEN; -} - /* Direct IPv6 traffic to br_nf_pre_routing_ipv6. * Replicate the checks that IPv4 does on packet reception. * Set skb->dev to the bridge device (i.e. parent of the diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c new file mode 100644 index 000000000000..6d12d2675c80 --- /dev/null +++ b/net/bridge/br_netfilter_ipv6.c @@ -0,0 +1,245 @@ +/* + * Handle firewalling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +/* We only check the length. A bridge shouldn't do any hop-by-hop stuff + * anyway + */ +static int br_nf_check_hbh_len(struct sk_buff *skb) +{ + unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); + u32 pkt_len; + const unsigned char *nh = skb_network_header(skb); + int off = raw - nh; + int len = (raw[1] + 1) << 3; + + if ((raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + case IPV6_TLV_JUMBO: + if (nh[off + 1] != 4 || (off & 3) != 2) + goto bad; + pkt_len = ntohl(*(__be32 *)(nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + goto bad; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + goto bad; + if (pskb_trim_rcsum(skb, + pkt_len + sizeof(struct ipv6hdr))) + goto bad; + nh = skb_network_header(skb); + break; + default: + if (optlen > len) + goto bad; + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 0; +bad: + return -1; +} + +int br_validate_ipv6(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + struct net_device *dev = skb->dev; + struct inet6_dev *idev = in6_dev_get(skb->dev); + u32 pkt_len; + u8 ip6h_len = sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, ip6h_len)) + goto inhdr_error; + + if (skb->len < ip6h_len) + goto drop; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + goto inhdr_error; + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + ip6h_len > skb->len) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) + goto drop; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + /* No IP options in IPv6 header; however it should be + * checked if some next headers need special treatment + */ + return 0; + +inhdr_error: + IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + +static inline bool +br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) +{ + return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, + sizeof(ipv6_hdr(skb)->daddr)) != 0; +} + +/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables + * PREROUTING and continue the bridge PRE_ROUTING hook. See comment + * for br_nf_pre_routing_finish(), same logic is used here but + * equivalent IPv6 function ip6_route_input() called indirectly. + */ +static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct rtable *rt; + struct net_device *dev = skb->dev; + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { + skb_dst_drop(skb); + v6ops->route_input(skb); + + if (skb_dst(skb)->error) { + kfree_skb(skb); + return 0; + } + + if (skb_dst(skb)->dev == dev) { + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, + sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); + skb->pkt_type = PACKET_HOST; + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + } + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +/* Replicate the checks that IPv6 does on packet reception and pass the packet + * to ip6tables. + */ +unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + + if (br_validate_ipv6(skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; + + skb->protocol = htons(ETH_P_IPV6); + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, + skb->dev, NULL, + br_nf_pre_routing_finish_ipv6); + + return NF_STOLEN; +} -- cgit From 2fd1dc910b39c97549d2d42a916985795ca8a641 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 17 Jun 2015 10:28:10 -0500 Subject: netfilter: Kill unused copies of RCV_SKB_FAIL This appears to have been a dead macro in both nfnetlink_log.c and nfnetlink_queue_core.c since these pieces of code were added in 2005. Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 2 -- net/netfilter/nfnetlink_queue_core.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 4ef1fae8445e..4670821b569d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -598,8 +598,6 @@ nla_put_failure: return -1; } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 6eccf0fcdc63..e26a46ef19ba 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -834,8 +834,6 @@ nfqnl_dev_drop(struct net *net, int ifindex) rcu_read_unlock(); } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) -- cgit From 17cebfd097fe8a21902602db5196f48e5a9d34e8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 17 Jun 2015 10:28:17 -0500 Subject: net: sched: Simplify em_ipset_match em->net is always set and always available, use it in preference to dev_net(skb->dev). Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/sched/em_ipset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index a3d79c8bf3b8..df0328ba6a48 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -92,8 +92,8 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, rcu_read_lock(); - if (dev && skb->skb_iif) - indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); acpar.in = indev ? indev : dev; acpar.out = dev; -- cgit From 10c04a8e715cca824f96bcbf4af07f5a40985357 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Jun 2015 10:28:26 -0500 Subject: netfilter: use forward declaration instead of including linux/proc_fs.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't need to pull the full definitions in that file, a simple forward declaration is enough. Moreover, include linux/procfs.h from nf_synproxy_core, otherwise this hits a compilation error due to missing declarations, ie. net/netfilter/nf_synproxy_core.c: In function ‘synproxy_proc_init’: net/netfilter/nf_synproxy_core.c:326:2: error: implicit declaration of function ‘proc_create’ [-Werror=implicit-function-declaration] if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, ^ Signed-off-by: Pablo Neira Ayuso Signed-off-by: Eric W. Biederman --- net/netfilter/nf_synproxy_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 52e20c9a46a5..789feeae6c44 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include -- cgit From a263653ed798216c0069922d7b5237ca49436007 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Jun 2015 10:28:27 -0500 Subject: netfilter: don't pull include/linux/netfilter.h from netns headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This pulls the full hook netfilter definitions from all those that include net_namespace.h. Instead let's just include the bare minimum required in the new linux/netfilter_defs.h file, and use it from the netfilter netns header files. I also needed to include in.h and in6.h from linux/netfilter.h otherwise we hit this compilation error: In file included from include/linux/netfilter_defs.h:4:0, from include/net/netns/netfilter.h:4, from include/net/net_namespace.h:22, from include/linux/netdevice.h:43, from net/netfilter/nfnetlink_queue_core.c:23: include/uapi/linux/netfilter.h:76:17: error: field ‘in’ has incomplete type struct in_addr in; And also explicit include linux/netfilter.h in several spots. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Eric W. Biederman --- net/ipv6/output_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 21678acd4521..928a0fb0b744 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -8,6 +8,7 @@ #include #include #include +#include static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, const struct in6_addr *dst, -- cgit From 8f481b50ea653ff0aea6accbb4bb02a15cf00531 Mon Sep 17 00:00:00 2001 From: Eric W Biederman Date: Wed, 17 Jun 2015 10:28:35 -0500 Subject: netfilter: Remove spurios included of netfilter.h While testing my netfilter changes I noticed several files where recompiling unncessarily because they unncessarily included netfilter.h. Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/ax25/af_ax25.c | 1 - net/ax25/ax25_in.c | 1 - net/ax25/ax25_ip.c | 1 - net/ax25/ax25_out.c | 1 - net/ax25/ax25_uid.c | 1 - net/netrom/nr_route.c | 1 - net/rose/rose_link.c | 1 - net/rose/rose_route.c | 1 - 8 files changed, 8 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 4273533d22b1..9c891d0412a2 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 7ed8ab724819..29a3687237aa 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 7c646bb2c6f7..b563a3f5f2a8 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index be2acab9be9d..8ddd41baa81c 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index 71c4badbc807..4ad2fb7bcd35 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 96b64d2f6dbf..d72a4f1558f2 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index e873d7d9f857..c76638cc2cd5 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -25,7 +25,6 @@ #include #include #include -#include #include static void rose_ftimer_expiry(unsigned long); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 40148932c8a4..0fc76d845103 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include -- cgit From f98f4514d07871da7a113dd9e3e330743fd70ae4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Jun 2015 07:59:11 -0700 Subject: packet: read num_members once in packet_rcv_fanout() We need to tell compiler it must not read f->num_members multiple times. Otherwise testing if num is not zero is flaky, and we could attempt an invalid divide by 0 in fanout_demux_cpu() Note bug was present in packet_rcv_fanout_hash() and packet_rcv_fanout_lb() but final 3.1 had a simple location after commit 95ec3eb417115fb ("packet: Add 'cpu' fanout policy.") Fixes: dc99f600698dc ("packet: Add fanout support.") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5989c6ee551..131545a06f05 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1353,7 +1353,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; - unsigned int num = f->num_members; + unsigned int num = READ_ONCE(f->num_members); struct packet_sock *po; unsigned int idx; -- cgit From 59f211181b5b10d4a95e1b9226febb0c0b6497c6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 16 Jun 2015 12:51:37 -0400 Subject: packet: free packet_rollover after synchronize_net Destruction of the po->rollover must be delayed until there are no more packets in flight that can access it. The field is destroyed in packet_release, before synchronize_net. Delay using rcu. Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state") Suggested-by: Eric Dumazet Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 3 ++- net/packet/internal.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fd5164139bf0..20e8c40da90d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1634,7 +1634,8 @@ static void fanout_release(struct sock *sk) } mutex_unlock(&fanout_mutex); - kfree(po->rollover); + if (po->rollover) + kfree_rcu(po->rollover, rcu); } static const struct proto_ops packet_ops; diff --git a/net/packet/internal.h b/net/packet/internal.h index c035d263c1e8..e20b3e8829b8 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -89,6 +89,7 @@ struct packet_fanout { struct packet_rollover { int sock; + struct rcu_head rcu; atomic_long_t num; atomic_long_t num_huge; atomic_long_t num_failed; -- cgit From d2a9ec6472650c49ac3f285637b9c04b53f11ae6 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 16 Jun 2015 20:44:07 +0200 Subject: net: rds: use for_each_sg() for scatterlist parsing This patch also renames sg to sglist and aligns function parameters. See Documentation/DMA-API.txt - Part Id for scatterlist details Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/rds/ib.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index c36d713229e0..2de28982260c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -235,28 +235,34 @@ extern struct workqueue_struct *rds_ib_wq; * doesn't define it. */ static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) { + struct scatterlist *sg; unsigned int i; - for (i = 0; i < sg_dma_len; ++i) { + for_each_sg(sglist, sg, sg_dma_len, i) { ib_dma_sync_single_for_cpu(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), direction); } } #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) { + struct scatterlist *sg; unsigned int i; - for (i = 0; i < sg_dma_len; ++i) { + for_each_sg(sglist, sg, sg_dma_len, i) { ib_dma_sync_single_for_device(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), direction); } } -- cgit From 2c51a97f76d20ebf1f50fef908b986cb051fdff9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 16 Jun 2015 22:56:39 +0300 Subject: neigh: do not modify unlinked entries The lockless lookups can return entry that is unlinked. Sometimes they get reference before last neigh_cleanup_and_release, sometimes they do not need reference. Later, any modification attempts may result in the following problems: 1. entry is not destroyed immediately because neigh_update can start the timer for dead entry, eg. on change to NUD_REACHABLE state. As result, entry lives for some time but is invisible and out of control. 2. __neigh_event_send can run in parallel with neigh_destroy while refcnt=0 but if timer is started and expired refcnt can reach 0 for second time leading to second neigh_destroy and possible crash. Thanks to Eric Dumazet and Ying Xue for their work and analyze on the __neigh_event_send change. Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Fixes: a263b3093641 ("ipv4: Make neigh lookups directly in output packet path.") Fixes: 6fd6ce2056de ("ipv6: Do not depend on rt->n in ip6_finish_output2().") Cc: Eric Dumazet Cc: Ying Xue Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3de654256028..2237c1b3cdd2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -957,6 +957,8 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; + if (neigh->dead) + goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + @@ -1013,6 +1015,13 @@ out_unlock_bh: write_unlock(&neigh->lock); local_bh_enable(); return rc; + +out_dead: + if (neigh->nud_state & NUD_STALE) + goto out_unlock_bh; + write_unlock_bh(&neigh->lock); + kfree_skb(skb); + return 1; } EXPORT_SYMBOL(__neigh_event_send); @@ -1076,6 +1085,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; + if (neigh->dead) + goto out; if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1225,6 +1236,8 @@ EXPORT_SYMBOL(neigh_update); */ void __neigh_set_probe_once(struct neighbour *neigh) { + if (neigh->dead) + return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; -- cgit From a55e1c5c2640549ff3beb843f7ecaa242d69c351 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 17 Jun 2015 00:16:59 +0200 Subject: pkt_sched: sch_qfq: remove redundant -if- control statement The control !hlist_unhashed() in qfq_destroy_agg() is unnecessary because already performed in hlist_del_init(), so remove it. Signed-off-by: Andrea Parri Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3ec7e88a43ca..b8d73bca683c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -339,8 +339,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - if (!hlist_unhashed(&agg->nonfull_next)) - hlist_del_init(&agg->nonfull_next); + hlist_del_init(&agg->nonfull_next); q->wsum -= agg->class_weight; if (q->wsum != 0) q->iwsum = ONE_FP / q->wsum; -- cgit From 36c01245eb8046c16eee6431e7dbfbb302635fa8 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sun, 21 Jun 2015 18:50:44 +0200 Subject: can: fix loss of CAN frames in raw_rcv As reported by Manfred Schlaegl here http://marc.info/?l=linux-netdev&m=143482089824232&w=2 commit 514ac99c64b "can: fix multiple delivery of a single CAN frame for overlapping CAN filters" requires the skb->tstamp to be set to check for identical CAN skbs. As net timestamping is influenced by several players (netstamp_needed and netdev_tstamp_prequeue) Manfred missed a proper timestamp which leads to CAN frame loss. As skb timestamping became now mandatory for CAN related skbs this patch makes sure that received CAN skbs always have a proper timestamp set. Maybe there's a better solution in the future but this patch fixes the CAN frame loss so far. Reported-by: Manfred Schlaegl Signed-off-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 32d710eaf1fc..689c818ed007 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -310,8 +310,12 @@ int can_send(struct sk_buff *skb, int loop) return err; } - if (newskb) + if (newskb) { + if (!(newskb->tstamp.tv64)) + __net_timestamp(newskb); + netif_rx_ni(newskb); + } /* update statistics */ can_stats.tx_frames++; -- cgit From 51f458d9612177f69c2e2c437034ae15f93078e7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Jun 2015 13:54:54 +0200 Subject: mac80211: fix locking in update_vlan_tailroom_need_count() Unfortunately, Michal's change to fix AP_VLAN crypto tailroom caused a locking issue that was reported by lockdep, but only in a few cases - the issue was a classic ABBA deadlock caused by taking the mtx after the key_mtx, where normally they're taken the other way around. As the key mutex protects the field in question (I'm adding a few annotations to make that clear) only the iteration needs to be protected, but we can also iterate the interface list with just RCU protection while holding the key mutex. Fixes: f9dca80b98ca ("mac80211: fix AP_VLAN crypto tailroom calculation") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/mac80211/key.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index a907f2d5c12d..81e9785f38bc 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -66,12 +66,15 @@ update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) if (sdata->vif.type != NL80211_IFTYPE_AP) return; - mutex_lock(&sdata->local->mtx); + /* crypto_tx_tailroom_needed_cnt is protected by this */ + assert_key_lock(sdata->local); + + rcu_read_lock(); - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + list_for_each_entry_rcu(vlan, &sdata->u.ap.vlans, u.vlan.list) vlan->crypto_tx_tailroom_needed_cnt += delta; - mutex_unlock(&sdata->local->mtx); + rcu_read_unlock(); } static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) @@ -95,6 +98,8 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) * http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net */ + assert_key_lock(sdata->local); + update_vlan_tailroom_need_count(sdata, 1); if (!sdata->crypto_tx_tailroom_needed_cnt++) { @@ -109,6 +114,8 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) { + assert_key_lock(sdata->local); + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta); update_vlan_tailroom_need_count(sdata, -delta); -- cgit From e0df02e0c2118892b302e8248dbbb21698bedfed Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 17 Jun 2015 10:59:10 -0400 Subject: sock_diag: fetch source port from inet_sock When an inet_sock is destroyed, its source port (sk_num) is set to zero as part of the unhash procedure. In order to supply a source port as part of the NETLINK_SOCK_DIAG socket destruction broadcasts, the source port number must be read from inet_sport instead. Tested: ss -E Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 21985d8d41e7..ba6a80f90e7d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1095,6 +1095,8 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) r = nlmsg_data(nlh); memset(r, 0, sizeof(*r)); inet_diag_msg_common_fill(r, sk); + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM) + r->id.idiag_sport = inet_sk(sk)->inet_sport; r->idiag_state = sk->sk_state; if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { -- cgit From b42be38b2778eda2237fc759e55e3b698b05b315 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Wed, 17 Jun 2015 17:14:33 +0200 Subject: netlink: add API to retrieve all group memberships This patch adds getsockopt(SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS) to retrieve all groups a socket is a member of. Currently, we have to use getsockname() and look at the nl.nl_groups bitmask. However, this mask is limited to 32 groups. Hence, similar to NETLINK_ADD_MEMBERSHIP and NETLINK_DROP_MEMBERSHIP, this adds a separate sockopt to manager higher groups IDs than 32. This new NETLINK_LIST_MEMBERSHIPS option takes a pointer to __u32 and the size of the array. The array is filled with the full membership-set of the socket, and the required array size is returned in optlen. Hence, user-space can retry with a properly sized array in case it was too small. Signed-off-by: David Herrmann Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 69d67c300b80..dea925388a5b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2290,6 +2290,28 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_LIST_MEMBERSHIPS: { + int pos, idx, shift; + + err = 0; + netlink_table_grab(); + for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { + if (len - pos < sizeof(u32)) + break; + + idx = pos / sizeof(unsigned long); + shift = (pos % sizeof(unsigned long)) * 8; + if (put_user((u32)(nlk->groups[idx] >> shift), + (u32 __user *)(optval + pos))) { + err = -EFAULT; + break; + } + } + if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) + err = -EFAULT; + netlink_table_ungrab(); + break; + } default: err = -ENOPROTOOPT; } -- cgit From a2bb6d7d6f4249691b6a554cde59969d55b0d9c3 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 17 Jun 2015 11:07:01 -0700 Subject: ipv4: include NLM_F_APPEND flag in append route notifications This patch adds NLM_F_APPEND flag to struct nlmsg_hdr->nlmsg_flags in newroute notifications if the route add was an append. (This is similar to how NLM_F_REPLACE is already part of new route replace notifications today) This helps userspace determine if the route add operation was an append. Signed-off-by: Roopa Prabhu Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3c699c4e90a4..6c666a9f1bd5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1082,6 +1082,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; + unsigned int nlflags = 0; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; @@ -1201,7 +1202,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa_match) goto out; - if (!(cfg->fc_nlflags & NLM_F_APPEND)) + if (cfg->fc_nlflags & NLM_F_APPEND) + nlflags = NLM_F_APPEND; + else fa = fa_first; } err = -ENOENT; @@ -1238,7 +1241,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, - &cfg->fc_nlinfo, 0); + &cfg->fc_nlinfo, nlflags); succeeded: return 0; -- cgit From 468479e6043c84f5a65299cc07cb08a22a28c2b1 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 17 Jun 2015 15:59:34 -0400 Subject: packet: avoid out of bounds read in round robin fanout PACKET_FANOUT_LB computes f->rr_cur such that it is modulo f->num_members. It returns the old value unconditionally, but f->num_members may have changed since the last store. Ensure that the return value is always < num. When modifying the logic, simplify it further by replacing the loop with an unconditional atomic increment. Fixes: dc99f600698d ("packet: Add fanout support.") Suggested-by: Eric Dumazet Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 131545a06f05..fe1610ddeacf 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1272,16 +1272,6 @@ static void packet_sock_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } -static int fanout_rr_next(struct packet_fanout *f, unsigned int num) -{ - int x = atomic_read(&f->rr_cur) + 1; - - if (x >= num) - x = 0; - - return x; -} - static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) @@ -1293,13 +1283,9 @@ static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { - int cur, old; + unsigned int val = atomic_inc_return(&f->rr_cur); - cur = atomic_read(&f->rr_cur); - while ((old = atomic_cmpxchg(&f->rr_cur, cur, - fanout_rr_next(f, num))) != cur) - cur = old; - return cur; + return val % num; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, -- cgit From 10ea5165e44ec8467e393a84ecfbf5732605d504 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Wed, 17 Jun 2015 16:08:31 -0700 Subject: switchdev: fdb filter_dev is always NULL for self (device), so remove check Remove the filter_dev check when dumping fdb entries, otherwise dump returns empty list. filter_dev is always passed as NULL when dumping fdbs on SELF. We want the fdbs installed on the device to be listed in the dump. Signed-off-by: Scott Feldman Fixes: 45d4122c ("switchdev: add support for fdb add/del/dump via switchdev_port_obj ops") Acked-by: Sridhar Samudrala Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 658bc3ac8008..c29f2327f2e6 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -656,7 +656,6 @@ struct switchdev_fdb_dump { struct switchdev_obj obj; struct sk_buff *skb; struct netlink_callback *cb; - struct net_device *filter_dev; int idx; }; @@ -669,14 +668,10 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev, u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - struct net_device *master = netdev_master_upper_dev_get(dev); if (dump->idx < dump->cb->args[0]) goto skip; - if (master && dump->filter_dev != master) - goto skip; - nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI); if (!nlh) @@ -730,7 +725,6 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, }, .skb = skb, .cb = cb, - .filter_dev = filter_dev, .idx = idx, }; int err; -- cgit From 3b1884435aa61320ac12d3b91f69ab034946cc3d Mon Sep 17 00:00:00 2001 From: Hiroaki SHIMODA Date: Thu, 18 Jun 2015 20:40:54 +0900 Subject: inet_diag: Remove _bh suffix in inet_diag_dump_reqs(). inet_diag_dump_reqs() is called from inet_diag_dump_icsk() with BH disabled. So no need to disable BH in inet_diag_dump_reqs(). Signed-off-by: Hiroaki Shimoda Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ba6a80f90e7d..516a1f6fbdd3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -746,7 +746,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); lopt = icsk->icsk_accept_queue.listen_opt; if (!lopt || !listen_sock_qlen(lopt)) @@ -794,7 +794,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return err; } -- cgit From dfea2aa654243f70dc53b8648d0bbdeec55a7df1 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 18 Jun 2015 09:15:34 -0700 Subject: tcp: Do not call tcp_fastopen_reset_cipher from interrupt context tcp_fastopen_reset_cipher really cannot be called from interrupt context. It allocates the tcp_fastopen_context with GFP_KERNEL and calls crypto_alloc_cipher, which allocates all kind of stuff with GFP_KERNEL. Thus, we might sleep when the key-generation is triggered by an incoming TFO cookie-request which would then happen in interrupt- context, as shown by enabling CONFIG_DEBUG_ATOMIC_SLEEP: [ 36.001813] BUG: sleeping function called from invalid context at mm/slub.c:1266 [ 36.003624] in_atomic(): 1, irqs_disabled(): 0, pid: 1016, name: packetdrill [ 36.004859] CPU: 1 PID: 1016 Comm: packetdrill Not tainted 4.1.0-rc7 #14 [ 36.006085] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 36.008250] 00000000000004f2 ffff88007f8838a8 ffffffff8171d53a ffff880075a084a8 [ 36.009630] ffff880075a08000 ffff88007f8838c8 ffffffff810967d3 ffff88007f883928 [ 36.011076] 0000000000000000 ffff88007f8838f8 ffffffff81096892 ffff88007f89be00 [ 36.012494] Call Trace: [ 36.012953] [] dump_stack+0x4f/0x6d [ 36.014085] [] ___might_sleep+0x103/0x170 [ 36.015117] [] __might_sleep+0x52/0x90 [ 36.016117] [] kmem_cache_alloc_trace+0x47/0x190 [ 36.017266] [] ? tcp_fastopen_reset_cipher+0x42/0x130 [ 36.018485] [] tcp_fastopen_reset_cipher+0x42/0x130 [ 36.019679] [] tcp_fastopen_init_key_once+0x61/0x70 [ 36.020884] [] __tcp_fastopen_cookie_gen+0x1c/0x60 [ 36.022058] [] tcp_try_fastopen+0x58f/0x730 [ 36.023118] [] tcp_conn_request+0x3e8/0x7b0 [ 36.024185] [] ? __module_text_address+0x12/0x60 [ 36.025327] [] tcp_v4_conn_request+0x51/0x60 [ 36.026410] [] tcp_rcv_state_process+0x190/0xda0 [ 36.027556] [] ? __inet_lookup_established+0x47/0x170 [ 36.028784] [] tcp_v4_do_rcv+0x16d/0x3d0 [ 36.029832] [] ? security_sock_rcv_skb+0x16/0x20 [ 36.030936] [] tcp_v4_rcv+0x77a/0x7b0 [ 36.031875] [] ? iptable_filter_hook+0x33/0x70 [ 36.032953] [] ip_local_deliver_finish+0x92/0x1f0 [ 36.034065] [] ip_local_deliver+0x9a/0xb0 [ 36.035069] [] ? ip_rcv+0x3d0/0x3d0 [ 36.035963] [] ip_rcv_finish+0x119/0x330 [ 36.036950] [] ip_rcv+0x2e7/0x3d0 [ 36.037847] [] __netif_receive_skb_core+0x552/0x930 [ 36.038994] [] __netif_receive_skb+0x27/0x70 [ 36.040033] [] process_backlog+0xd2/0x1f0 [ 36.041025] [] net_rx_action+0x122/0x310 [ 36.042007] [] __do_softirq+0x103/0x2f0 [ 36.042978] [] do_softirq_own_stack+0x1c/0x30 This patch moves the call to tcp_fastopen_init_key_once to the places where a listener socket creates its TFO-state, which always happens in user-context (either from the setsockopt, or implicitly during the listen()-call) Cc: Eric Dumazet Cc: Hannes Frederic Sowa Fixes: 222e83d2e0ae ("tcp: switch tcp_fastopen key generation to net_get_random_once") Signed-off-by: Christoph Paasch Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp.c | 7 +++++-- net/ipv4/tcp_fastopen.c | 2 -- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b47a4d79d04..a5aa54ea6533 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -228,6 +228,8 @@ int inet_listen(struct socket *sock, int backlog) err = 0; if (err) goto out; + + tcp_fastopen_init_key_once(true); } err = inet_csk_listen_start(sk, backlog); if (err) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1377f2a0472..bb2ce74f6004 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2545,10 +2545,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | - TCPF_LISTEN))) + TCPF_LISTEN))) { + tcp_fastopen_init_key_once(true); + err = fastopen_init_queue(sk, val); - else + } else { err = -EINVAL; + } break; case TCP_TIMESTAMP: if (!tp->repair) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 46b087a27503..f9c0fb84e435 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -78,8 +78,6 @@ static bool __tcp_fastopen_cookie_gen(const void *path, struct tcp_fastopen_context *ctx; bool ok = false; - tcp_fastopen_init_key_once(true); - rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { -- cgit From d496f7842aada20c61e6044b3395383fa972872c Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 19 Jun 2015 00:46:53 +0200 Subject: NET: ROSE: Don't dereference NULL neighbour pointer. A ROSE socket doesn't necessarily always have a neighbour pointer so check if the neighbour pointer is valid before dereferencing it. Signed-off-by: Ralf Baechle Tested-by: Bernard Pidoux Cc: stable@vger.kernel.org #2.6.11+ Signed-off-by: David S. Miller --- net/rose/af_rose.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 8ae603069a1a..dd304bc40788 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -192,7 +192,8 @@ static void rose_kill_by_device(struct net_device *dev) if (rose->device == dev) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); - rose->neighbour->use--; + if (rose->neighbour) + rose->neighbour->use--; rose->device = NULL; } } -- cgit From 7ce42de1895d4787b47b004638d642dcacb464fe Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 19 Jun 2015 01:45:50 -0700 Subject: bridge: multicast: start querier timer when running user-space stp When STP is running in user-space and querier is configured, the querier timer is not started when a port goes to a non-blocking state. This patch unifies the user- and kernel-space stp multicast port enable path and enables it in all states different from blocking. Note that when a port goes in BR_STATE_DISABLED it's not enabled because that is handled in the beginning of the port list loop. Signed-off-by: Nikolay Aleksandrov Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 45f1ff113af9..e7ab74b405a1 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -428,7 +428,6 @@ static void br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_multicast_enable_port(p); br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); @@ -462,6 +461,8 @@ void br_port_state_selection(struct net_bridge *br) } } + if (p->state != BR_STATE_BLOCKING) + br_multicast_enable_port(p); if (p->state == BR_STATE_FORWARDING) ++liveports; } -- cgit From 754bc547f0a79f7568b5b81c7fc0a8d044a6571a Mon Sep 17 00:00:00 2001 From: Satish Ashok Date: Fri, 19 Jun 2015 01:22:57 -0700 Subject: bridge: multicast: restore router configuration on port link down/up When a port goes through a link down/up the multicast router configuration is not restored. Signed-off-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Fixes: 0909e11758bd ("bridge: Add multicast_router sysfs entries") Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index ff667e18b2d6..761fc733bf6d 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -37,6 +37,8 @@ static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); +static void br_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port); unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) @@ -936,6 +938,8 @@ void br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif + if (port->multicast_router == 2 && hlist_unhashed(&port->rlist)) + br_multicast_add_router(br, port); out: spin_unlock(&br->multicast_lock); -- cgit From fdab6a4cbd8933092155449ca7253eba973ada14 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 19 Jun 2015 10:41:21 -0500 Subject: netfilter: nftables: Do not run chains in the wrong network namespace Currenlty nf_tables chains added in one network namespace are being run in all network namespace. The issues are myriad with the simplest being an unprivileged user can cause any network packets to be dropped. Address this by simply not running nf_tables chains in the wrong network namespace. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_tables_core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index f153b07073af..f77bad46ac68 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -114,7 +114,8 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) { const struct nft_chain *chain = ops->priv, *basechain = chain; - const struct net *net = read_pnet(&nft_base_chain(basechain)->pnet); + const struct net *chain_net = read_pnet(&nft_base_chain(basechain)->pnet); + const struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_regs regs; @@ -124,6 +125,10 @@ nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) int rulenum; unsigned int gencursor = nft_genmask_cur(net); + /* Ignore chains that are not for the current network namespace */ + if (!net_eq(net, chain_net)) + return NF_ACCEPT; + do_chain: rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); -- cgit From 8405a8fff3f8545c888a872d6e3c0c8eecd4d348 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 19 Jun 2015 14:03:39 -0500 Subject: netfilter: nf_qeueue: Drop queue entries on nf_unregister_hook Add code to nf_unregister_hook to flush the nf_queue when a hook is unregistered. This guarantees that the pointer that the nf_queue code retains into the nf_hook list will remain valid while a packet is queued. I tested what would happen if we do not flush queued packets and was trivially able to obtain the oops below. All that was required was to stop the nf_queue listening process, to delete all of the nf_tables, and to awaken the nf_queue listening process. > BUG: unable to handle kernel paging request at 0000000100000001 > IP: [<0000000100000001>] 0x100000001 > PGD b9c35067 PUD 0 > Oops: 0010 [#1] SMP > Modules linked in: > CPU: 0 PID: 519 Comm: lt-nfqnl_test Not tainted > task: ffff8800b9c8c050 ti: ffff8800ba9d8000 task.ti: ffff8800ba9d8000 > RIP: 0010:[<0000000100000001>] [<0000000100000001>] 0x100000001 > RSP: 0018:ffff8800ba9dba40 EFLAGS: 00010a16 > RAX: ffff8800bab48a00 RBX: ffff8800ba9dba90 RCX: ffff8800ba9dba90 > RDX: ffff8800b9c10128 RSI: ffff8800ba940900 RDI: ffff8800bab48a00 > RBP: ffff8800b9c10128 R08: ffffffff82976660 R09: ffff8800ba9dbb28 > R10: dead000000100100 R11: dead000000200200 R12: ffff8800ba940900 > R13: ffffffff8313fd50 R14: ffff8800b9c95200 R15: 0000000000000000 > FS: 00007fb91fc34700(0000) GS:ffff8800bfa00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 0000000100000001 CR3: 00000000babfb000 CR4: 00000000000007f0 > Stack: > ffffffff8206ab0f ffffffff82982240 ffff8800bab48a00 ffff8800b9c100a8 > ffff8800b9c10100 0000000000000001 ffff8800ba940900 ffff8800b9c10128 > ffffffff8206bd65 ffff8800bfb0d5e0 ffff8800bab48a00 0000000000014dc0 > Call Trace: > [] ? nf_iterate+0x4f/0xa0 > [] ? nf_reinject+0x125/0x190 > [] ? nfqnl_recv_verdict+0x255/0x360 > [] ? nla_parse+0x80/0xf0 > [] ? nfnetlink_rcv_msg+0x13c/0x240 > [] ? __memcg_kmem_get_cache+0x4c/0x150 > [] ? nfnl_lock+0x20/0x20 > [] ? netlink_rcv_skb+0xa9/0xc0 > [] ? netlink_unicast+0x12f/0x1c0 > [] ? netlink_sendmsg+0x28e/0x650 > [] ? sock_sendmsg+0x44/0x50 > [] ? ___sys_sendmsg+0x2ab/0x2c0 > [] ? __wake_up+0x43/0x70 > [] ? tty_write+0x1c4/0x2a0 > [] ? __sys_sendmsg+0x44/0x80 > [] ? system_call_fastpath+0x12/0x6a > Code: Bad RIP value. > RIP [<0000000100000001>] 0x100000001 > RSP > CR2: 0000000100000001 > ---[ end trace 08eb65d42362793f ]--- Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/netfilter/core.c | 1 + net/netfilter/nf_internals.h | 1 + net/netfilter/nf_queue.c | 17 +++++++++++++++++ net/netfilter/nfnetlink_queue_core.c | 24 +++++++++++++++++++++++- 4 files changed, 42 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 653e32eac08c..a0e54974e2c9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -118,6 +118,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); + nf_queue_nf_hook_drop(reg); } EXPORT_SYMBOL(nf_unregister_hook); diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index ea7f36784b3d..399210693c2a 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -19,6 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum); +void nf_queue_nf_hook_drop(struct nf_hook_ops *ops); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 2e88032cd5ad..cd60d397fe05 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -105,6 +105,23 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); +void nf_queue_nf_hook_drop(struct nf_hook_ops *ops) +{ + const struct nf_queue_handler *qh; + struct net *net; + + rtnl_lock(); + rcu_read_lock(); + qh = rcu_dereference(queue_handler); + if (qh) { + for_each_net(net) { + qh->nf_hook_drop(net, ops); + } + } + rcu_read_unlock(); + rtnl_unlock(); +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index e26a46ef19ba..685cc6a17163 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -850,6 +850,27 @@ static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; +static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) +{ + return entry->elem == (struct nf_hook_ops *)ops_ptr; +} + +static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +{ + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int i; + + rcu_read_lock(); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_rcu(inst, head, hlist) + nfqnl_flush(inst, nf_hook_cmp, (unsigned long)hook); + } + rcu_read_unlock(); +} + static int nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1057,7 +1078,8 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { }; static const struct nf_queue_handler nfqh = { - .outfn = &nfqnl_enqueue_packet, + .outfn = &nfqnl_enqueue_packet, + .nf_hook_drop = &nfqnl_nf_hook_drop, }; static int -- cgit From e8e85cc5eb5701b935a06b5b3a03a8532946f969 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Mon, 22 Jun 2015 12:39:16 +0530 Subject: packet: remove handling of tx_ring Remove handling of tx_ring in prb_setup_retire_blk_timer for TPACKET_V3 because init_prb_bdqc is called only for zero tx_ring and thus prb_setup_retire_blk_timer for zero tx_ring only. And also in functon init_prb_bdqc there is no usage of tx_ring. Thus removing tx_ring from init_prb_bdqc. Signed-off-by: Maninder Singh Suggested-by: Frans Klaver Signed-off-by: David S. Miller --- net/packet/af_packet.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 20e8c40da90d..f5c87031b121 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -543,15 +543,11 @@ static void prb_init_blk_timer(struct packet_sock *po, pkc->retire_blk_timer.expires = jiffies; } -static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) +static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; - if (tx_ring) - BUG(); - - pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : - GET_PBDQC_FROM_RB(&po->rx_ring); + pkc = GET_PBDQC_FROM_RB(&po->rx_ring); prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); } @@ -607,7 +603,7 @@ static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, - union tpacket_req_u *req_u, int tx_ring) + union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; @@ -634,7 +630,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po, tx_ring); + prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } @@ -4002,7 +3998,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, * it above but just being paranoid */ if (!tx_ring) - init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); + init_prb_bdqc(po, rb, pg_vec, req_u); break; default: break; -- cgit From 3e3a78b49508e58f798cf519876bbb9ca0f931af Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Mon, 22 Jun 2015 00:27:16 -0700 Subject: switchdev: rename vlan vid_start to vid_begin Use vid_begin/end to be consistent with BRIDGE_VLAN_INFO_RANGE_BEGIN/END. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 4 ++-- net/switchdev/switchdev.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 17fc358a5432..574feea6a8cc 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -54,7 +54,7 @@ static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, .id = SWITCHDEV_OBJ_PORT_VLAN, .u.vlan = { .flags = flags, - .vid_start = vid, + .vid_begin = vid, .vid_end = vid, }, }; @@ -132,7 +132,7 @@ static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, struct switchdev_obj vlan_obj = { .id = SWITCHDEV_OBJ_PORT_VLAN, .u.vlan = { - .vid_start = vid, + .vid_begin = vid, .vid_end = vid, }, }; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index c29f2327f2e6..448d9199cea2 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -510,23 +510,23 @@ static int switchdev_port_br_afspec(struct net_device *dev, vinfo = nla_data(attr); vlan->flags = vinfo->flags; if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - if (vlan->vid_start) + if (vlan->vid_begin) return -EINVAL; - vlan->vid_start = vinfo->vid; + vlan->vid_begin = vinfo->vid; } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { - if (!vlan->vid_start) + if (!vlan->vid_begin) return -EINVAL; vlan->vid_end = vinfo->vid; - if (vlan->vid_end <= vlan->vid_start) + if (vlan->vid_end <= vlan->vid_begin) return -EINVAL; err = f(dev, &obj); if (err) return err; memset(vlan, 0, sizeof(*vlan)); } else { - if (vlan->vid_start) + if (vlan->vid_begin) return -EINVAL; - vlan->vid_start = vinfo->vid; + vlan->vid_begin = vinfo->vid; vlan->vid_end = vinfo->vid; err = f(dev, &obj); if (err) -- cgit From 7d4f8d871ab15bd50a5771382ca2c9355b38d73c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Mon, 22 Jun 2015 00:27:17 -0700 Subject: switchdev; add VLAN support for port's bridge_getlink One more missing piece of the puzzle. Add vlan dump support to switchdev port's bridge_getlink. iproute2 "bridge vlan show" cmd already knows how to show the vlans installed on the bridge and the device , but (until now) no one implemented the port vlan part of the netlink PF_BRIDGE:RTM_GETLINK msg. Before this patch, "bridge vlan show": $ bridge -c vlan show port vlan ids sw1p1 30-34 << bridge side vlans 57 sw1p1 << device side vlans (missing) sw1p2 57 sw1p2 sw1p3 sw1p4 br0 None (When the port is bridged, the output repeats the vlan list for the vlans on the bridge side of the port and the vlans on the device side of the port. The listing above show no vlans for the device side even though they are installed). After this patch: $ bridge -c vlan show port vlan ids sw1p1 30-34 << bridge side vlan 57 sw1p1 30-34 << device side vlans 57 3840 PVID sw1p2 57 sw1p2 57 3840 PVID sw1p3 3842 PVID sw1p4 3843 PVID br0 None I re-used ndo_dflt_bridge_getlink to add vlan fill call-back func. switchdev support adds an obj dump for VLAN objects, using the same call-back scheme as FDB dump. Support included for both compressed and un-compressed vlan dumps. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 18 +++++-- net/switchdev/switchdev.c | 123 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2d102ce1474f..01ced4a889e0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2908,7 +2908,11 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, - u32 flags, u32 mask, int nlflags) + u32 flags, u32 mask, int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; @@ -2916,6 +2920,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); + int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) @@ -2956,6 +2961,13 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, goto nla_put_failure; } } + if (vlan_fill) { + err = vlan_fill(skb, dev, filter_mask); + if (err) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); @@ -2989,9 +3001,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return 0; nla_put_failure: nlmsg_cancel(skb, nlh); - return -EMSGSIZE; + return err ? err : -EMSGSIZE; } -EXPORT_SYMBOL(ndo_dflt_bridge_getlink); +EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 448d9199cea2..3e4b35a0c8cb 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -391,6 +391,126 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); +struct switchdev_vlan_dump { + struct switchdev_obj obj; + struct sk_buff *skb; + u32 filter_mask; + u16 flags; + u16 begin; + u16 end; +}; + +static int switchdev_port_vlan_dump_put(struct net_device *dev, + struct switchdev_vlan_dump *dump) +{ + struct bridge_vlan_info vinfo; + + vinfo.flags = dump->flags; + + if (dump->begin == 0 && dump->end == 0) { + return 0; + } else if (dump->begin == dump->end) { + vinfo.vid = dump->begin; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } else { + vinfo.vid = dump->begin; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + vinfo.vid = dump->end; + vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } + + return 0; +} + +static int switchdev_port_vlan_dump_cb(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_vlan_dump *dump = + container_of(obj, struct switchdev_vlan_dump, obj); + struct switchdev_obj_vlan *vlan = &dump->obj.u.vlan; + int err = 0; + + if (vlan->vid_begin > vlan->vid_end) + return -EINVAL; + + if (dump->filter_mask & RTEXT_FILTER_BRVLAN) { + dump->flags = vlan->flags; + for (dump->begin = dump->end = vlan->vid_begin; + dump->begin <= vlan->vid_end; + dump->begin++, dump->end++) { + err = switchdev_port_vlan_dump_put(dev, dump); + if (err) + return err; + } + } else if (dump->filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) { + if (dump->begin > vlan->vid_begin && + dump->begin >= vlan->vid_end) { + if ((dump->begin - 1) == vlan->vid_end && + dump->flags == vlan->flags) { + /* prepend */ + dump->begin = vlan->vid_begin; + } else { + err = switchdev_port_vlan_dump_put(dev, dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else if (dump->end <= vlan->vid_begin && + dump->end < vlan->vid_end) { + if ((dump->end + 1) == vlan->vid_begin && + dump->flags == vlan->flags) { + /* append */ + dump->end = vlan->vid_end; + } else { + err = switchdev_port_vlan_dump_put(dev, dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else { + err = -EINVAL; + } + } + + return err; +} + +static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, + u32 filter_mask) +{ + struct switchdev_vlan_dump dump = { + .obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + .cb = switchdev_port_vlan_dump_cb, + }, + .skb = skb, + .filter_mask = filter_mask, + }; + int err = 0; + + if ((filter_mask & RTEXT_FILTER_BRVLAN) || + (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { + err = switchdev_port_obj_dump(dev, &dump.obj); + if (err) + goto err_out; + if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) + /* last one */ + err = switchdev_port_vlan_dump_put(dev, &dump); + } + +err_out: + return err == -EOPNOTSUPP ? 0 : err; +} + /** * switchdev_port_bridge_getlink - Get bridge port attributes * @@ -415,7 +535,8 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return err; return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, - attr.u.brport_flags, mask, nlflags); + attr.u.brport_flags, mask, nlflags, + filter_mask, switchdev_port_vlan_fill); } EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); -- cgit From e9fdaec0e0d40f548c2b79e147c7ffd2809d2a64 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Thu, 11 Jun 2015 11:20:42 -0700 Subject: switchdev: change BUG_ON to WARN for attr set failure case This particular BUG_ON condition was checking for attr set err in the COMMIT phase, which isn't expected (it's a driver bug if PREPARE phase is OK but COMMIT fails). But BUG_ON() is too strong for this case, so change to WARN(). BUG_ON() would be warranted if the system was corrupted beyond repair, but this is not the case here. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 3e4b35a0c8cb..f01d34075749 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -184,7 +184,8 @@ int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) attr->trans = SWITCHDEV_TRANS_COMMIT; err = __switchdev_port_attr_set(dev, attr); - BUG_ON(err); + WARN(err, "%s: Commit of attribute (id=%d) failed.\n", + dev->name, attr->id); return err; } -- cgit From 34b99df4e6256ddafb663c6de0711dceceddfe0e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 23 Jun 2015 08:34:39 +0300 Subject: ip: report the original address of ICMP messages ICMP messages can trigger ICMP and local errors. In this case serr->port is 0 and starting from Linux 4.0 we do not return the original target address to the error queue readers. Add function to define which errors provide addr_offset. With this fix my ping command is not silent anymore. Fixes: c247f0534cc5 ("ip: fix error queue empty skb handling") Signed-off-by: Julian Anastasov Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 11 ++++++++++- net/ipv6/datagram.c | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7cfb0893f263..6ddde89996f4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -432,6 +432,15 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +/* For some errors we have valid addr_offset even with zero payload and + * zero port. Also, addr_offset should be supported if port is set. + */ +static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr) +{ + return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; +} + /* IPv4 supports cmsg on all imcp errors and some timestamps * * Timestamp code paths do not initialize the fields expected by cmsg: @@ -498,7 +507,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && serr->port) { + if (sin && ipv4_datagram_support_addr(serr)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 762a58c772b8..62d908e64eeb 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,6 +325,16 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } +/* For some errors we have valid addr_offset even with zero payload and + * zero port. Also, addr_offset should be supported if port is set. + */ +static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) +{ + return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || + serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; +} + /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 @@ -389,7 +399,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && serr->port) { + if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; -- cgit From 5c8079d049c8452aeacec96a260200d468afc87d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 23 Jun 2015 10:26:04 -0400 Subject: net: switchdev: ignore unsupported bridge flags switchdev_port_bridge_getlink() queries SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS attributes, but a driver doesn't need to implement this in order to get bridge link information. So error out only on errors different than -EOPNOTSUPP. (This is a follow-up patch for 7d4f8d8.) Fixes: 8793d0a664a8 ("switchdev: add new switchdev_port_bridge_getlink") Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index f01d34075749..84f77a054025 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -532,7 +532,7 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, int err; err = switchdev_port_attr_get(dev, &attr); - if (err) + if (err && err != -EOPNOTSUPP) return err; return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, -- cgit From 8a3d03166f19329b46c6f9e900f93a89f446077b Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 23 Jun 2015 13:45:36 -0400 Subject: net: track link-status of ipv4 nexthops Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are reachable via an interface where carrier is off. No action is taken, but additional flags are passed to userspace to indicate carrier status. This also includes a cleanup to fib_disable_ip to more clearly indicate what event made the function call to replace the more cryptic force option previously used. v2: Split out kernel functionality into 2 patches, this patch simply sets and clears new nexthop flag RTNH_F_LINKDOWN. v3: Cleanups suggested by Alex as well as a bug noticed in fib_sync_down_dev and fib_sync_up when multipath was not enabled. v5: Whitespace and variable declaration fixups suggested by Dave. v6: Style fixups noticed by Dave; ran checkpatch to be sure I got them all. Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 23 ++++++++++++------- net/ipv4/fib_semantics.c | 60 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 872494e6e6eb..534eb1485045 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1063,9 +1063,9 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, unsigned long event) { - if (fib_sync_down_dev(dev, force)) + if (fib_sync_down_dev(dev, event)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1081,7 +1081,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); @@ -1093,7 +1093,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, event); } else { rt_cache_flush(dev_net(dev)); } @@ -1107,9 +1107,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; struct net *net = dev_net(dev); + unsigned int flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, event); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1124,16 +1125,22 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_add_ifaddr(ifa); } endfor_ifa(in_dev); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, event); break; - case NETDEV_CHANGEMTU: case NETDEV_CHANGE: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + fib_sync_up(dev, RTNH_F_LINKDOWN); + else + fib_sync_down_dev(dev, event); + /* fall through */ + case NETDEV_CHANGEMTU: rt_cache_flush(net); break; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 28ec3c1823bf..b1b305b1e340 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -266,7 +266,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; } endfor_nexthops(fi); @@ -318,7 +318,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -604,6 +604,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -636,6 +638,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -654,6 +658,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->nh_dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -920,11 +926,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!nh->nh_dev) goto failure; } else { + int linkdown = 0; + change_nexthops(fi) { err = fib_check_nh(cfg, fi, nexthop_nh); if (err != 0) goto failure; + if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + linkdown++; } endfor_nexthops(fi) + if (linkdown == fi->fib_nhs) + fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc) { @@ -1103,7 +1115,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, int force) +int fib_sync_down_dev(struct net_device *dev, unsigned long event) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1112,7 +1124,8 @@ int fib_sync_down_dev(struct net_device *dev, int force) struct hlist_head *head = &fib_info_devhash[hash]; struct fib_nh *nh; - if (force) + if (event == NETDEV_UNREGISTER || + event == NETDEV_DOWN) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + break; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); fi->fib_power -= nexthop_nh->nh_power; @@ -1139,14 +1160,23 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nexthop_nh->nh_dev == dev) { + if (event == NETDEV_UNREGISTER && + nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + fi->fib_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + fi->fib_flags |= RTNH_F_LINKDOWN; + break; + } ret++; } } @@ -1210,13 +1240,11 @@ out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev) +int fib_sync_up(struct net_device *dev, unsigned int nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1243,7 +1271,7 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & nh_flags)) { alive++; continue; } @@ -1254,14 +1282,18 @@ int fib_sync_up(struct net_device *dev) !__in_dev_get_rtnl(dev)) continue; alive++; +#ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_flags &= ~nh_flags; spin_unlock_bh(&fib_multipath_lock); +#else + nexthop_nh->nh_flags &= ~nh_flags; +#endif } endfor_nexthops(fi) if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; + fi->fib_flags &= ~nh_flags; ret++; } } @@ -1269,6 +1301,8 @@ int fib_sync_up(struct net_device *dev) return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* * The algorithm is suboptimal, but it provides really * fair weighted route distribution. -- cgit From 0eeb075fad736fb92620af995c47c204bbb5e829 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 23 Jun 2015 13:45:37 -0400 Subject: net: ipv4 sysctl option to ignore routes when nexthop link is down This feature is only enabled with the new per-interface or ipv4 global sysctls called 'ignore_routes_with_linkdown'. net.ipv4.conf.all.ignore_routes_with_linkdown = 0 net.ipv4.conf.default.ignore_routes_with_linkdown = 0 net.ipv4.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, will report to userspace that a route is dead and will no longer resolve to this nexthop when performing a fib lookup. This will signal to userspace that the route will not be selected. The signalling of a RTNH_F_DEAD is only passed to userspace if the sysctl is enabled and link is down. This was done as without it the netlink listeners would have no idea whether or not a nexthop would be selected. The kernel only sets RTNH_F_DEAD internally if the interface has IFF_UP cleared. With the new sysctl set, the following behavior can be observed (interface p8p1 is link-down): default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 dead linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 90.0.0.1 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 via 10.0.5.2 dev p9p1 src 10.0.5.15 cache While the route does remain in the table (so it can be modified if needed rather than being wiped away as it would be if IFF_UP was cleared), the proper next-hop is chosen automatically when the link is down. Now interface p8p1 is linked-up: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 90.0.0.1 via 80.0.0.2 dev p8p1 src 80.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 dev p8p1 src 80.0.0.1 cache and the output changes to what one would expect. If the sysctl is not set, the following output would be expected when p8p1 is down: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 Since the dead flag does not appear, there should be no expectation that the kernel would skip using this route due to link being down. v2: Split kernel changes into 2 patches, this actually makes a behavioral change if the sysctl is set. Also took suggestion from Alex to simplify code by only checking sysctl during fib lookup and suggestion from Scott to add a per-interface sysctl. v3: Code clean-ups to make it more readable and efficient as well as a reverse path check fix. v4: Drop binary sysctl v5: Whitespace fixups from Dave v6: Style changes from Dave and checkpatch suggestions v7: One more checkpatch fixup Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 ++ net/ipv4/fib_frontend.c | 6 +++--- net/ipv4/fib_rules.c | 5 +++-- net/ipv4/fib_semantics.c | 33 ++++++++++++++++++++++++++++----- net/ipv4/fib_trie.c | 6 ++++++ net/ipv4/netfilter/ipt_rpfilter.c | 2 +- net/ipv4/route.c | 10 +++++----- 7 files changed, 48 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 419d23c53ec7..7498716e8f54 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2169,6 +2169,8 @@ static struct devinet_sysctl_table { "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, + "ignore_routes_with_linkdown"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 534eb1485045..6bbc54940eb4 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -280,7 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - if (!fib_lookup(net, &fl4, &res)) + if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { scope = RT_SCOPE_LINK; @@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; net = dev_net(dev); - if (fib_lookup(net, &fl4, &res)) + if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) @@ -354,7 +354,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl4, &res) == 0) { + if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 56151982f74e..18123d50f576 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,11 +47,12 @@ struct fib4_rule { #endif }; -int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, - .flags = FIB_LOOKUP_NOREF, + .flags = flags, }; int err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b1b305b1e340..3bfccd83551c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -623,7 +623,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); if (err) { rcu_read_unlock(); return err; @@ -1035,12 +1036,20 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { + struct in_device *in_dev; + if (fi->fib_nh->nh_gw && nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; if (fi->fib_nh->nh_oif && nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; + if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtm->rtm_flags |= RTNH_F_DEAD; + } #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) @@ -1057,11 +1066,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; for_nexthops(fi) { + struct in_device *in_dev; + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_flags = nh->nh_flags & 0xFF; + if (nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rcu(nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtnh->rtnh_flags |= RTNH_F_DEAD; + } rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; @@ -1310,16 +1327,22 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; + struct in_device *in_dev; int w; spin_lock_bh(&fib_multipath_lock); if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } + in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); + if (nexthop_nh->nh_flags & RTNH_F_DEAD) + continue; + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + continue; + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; } endfor_nexthops(fi); fi->fib_power = power; if (power <= 0) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6c666a9f1bd5..15d32612e3c6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1412,9 +1412,15 @@ found: continue; for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { const struct fib_nh *nh = &fi->fib_nh[nhsel]; + struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); if (nh->nh_flags & RTNH_F_DEAD) continue; + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN && + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) + continue; if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 4bfaedf9b34e..8618fd150c96 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -40,7 +40,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, struct net *net = dev_net(dev); int ret __maybe_unused; - if (fib_lookup(net, fl4, &res)) + if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (res.type != RTN_UNICAST) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6055984c307..d0362a2de3d3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -747,7 +747,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { - if (fib_lookup(net, fl4, &res) == 0) { + if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, @@ -975,7 +975,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, @@ -1186,7 +1186,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) fl4.flowi4_mark = skb->mark; rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, @@ -1716,7 +1716,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.daddr = daddr; fl4.saddr = saddr; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; @@ -2123,7 +2123,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res)) { + if (fib_lookup(net, fl4, &res, 0)) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { -- cgit From 204621551b2a0060a013b92f7add4d5c452fa7cb Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 24 Jun 2015 11:02:51 +0200 Subject: net: inet_diag: export IPV6_V6ONLY sockopt For AF_INET6 sockets, the value of struct ipv6_pinfo.ipv6only is exported to userspace. It indicates whether a socket bound to in6addr_any listens on IPv4 as well as IPv6. Since the socket is natively IPv6, it is not listed by e.g. 'ss -l -4'. This patch is accompanied by an appropriate one for iproute2 to enable the additional information in 'ss -e'. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 516a1f6fbdd3..9bc26677058e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -151,6 +151,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; + + if (ipv6_only_sock(sk) && + nla_put_u8(skb, INET_DIAG_SKV6ONLY, 1)) + goto errout; } #endif -- cgit From 9aa66382163e784acac0ce3580ed202d9a56d1ac Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2015 04:47:44 -0700 Subject: bridge: multicast: add a comment to br_port_state_selection about blocking state Add a comment to explain why we're not disabling port's multicast when it goes in blocking state. Since there's a check in the timer's function which bypasses the timer if the port's in blocking/disabled state, the timer will simply expire and stop without sending more queries. Suggested-by: Herbert Xu Signed-off-by: Nikolay Aleksandrov Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index e7ab74b405a1..b4b6dab9c285 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -463,6 +463,10 @@ void br_port_state_selection(struct net_bridge *br) if (p->state != BR_STATE_BLOCKING) br_multicast_enable_port(p); + /* Multicast is not disabled for the port when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. + */ if (p->state == BR_STATE_FORWARDING) ++liveports; } -- cgit From 1ea2d020ba477cb7011a7174e8501a9e04a325d4 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2015 05:28:16 -0700 Subject: bridge: vlan: flush the dynamically learned entries on port vlan delete Add a new argument to br_fdb_delete_by_port which allows to specify a vid to match when flushing entries and use it in nbp_vlan_delete() to flush the dynamically learned entries of the vlan/port pair when removing a vlan from a port. Before this patch only the local mac was being removed and the dynamically learned ones were left to expire. Note that the do_all argument is still respected and if specified, the vid will be ignored. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 7 +++++-- net/bridge/br_if.c | 4 ++-- net/bridge/br_private.h | 2 +- net/bridge/br_stp_if.c | 2 +- net/bridge/br_sysfs_if.c | 2 +- net/bridge/br_vlan.c | 1 + 6 files changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index be84b7e5a3da..9e9875da0a4f 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -330,9 +330,11 @@ void br_fdb_flush(struct net_bridge *br) /* Flush all entries referring to a specific port. * if do_all is set also flush static entries + * if vid is set delete all entries that match the vlan_id */ void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, + u16 vid, int do_all) { int i; @@ -347,8 +349,9 @@ void br_fdb_delete_by_port(struct net_bridge *br, if (f->dst != p) continue; - if (f->is_static && !do_all) - continue; + if (!do_all) + if (f->is_static || (vid && f->vlan_id != vid)) + continue; if (f->is_local) fdb_delete_local(br, p, f); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 1849d96b3c91..a538cb1199a3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -249,7 +249,7 @@ static void del_nbp(struct net_bridge_port *p) list_del_rcu(&p->list); nbp_vlan_flush(p); - br_fdb_delete_by_port(br, p, 1); + br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); @@ -278,7 +278,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } - br_fdb_delete_by_port(br, NULL, 1); + br_fdb_delete_by_port(br, NULL, 0, 1); br_vlan_flush(br); del_timer_sync(&br->gc_timer); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5dccced71269..8b21146b24a0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -387,7 +387,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(unsigned long arg); void br_fdb_delete_by_port(struct net_bridge *br, - const struct net_bridge_port *p, int do_all); + const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, const unsigned char *addr, __u16 vid); int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 7832d07f48f6..a2730e7196cd 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -111,7 +111,7 @@ void br_stp_disable_port(struct net_bridge_port *p) del_timer(&p->forward_delay_timer); del_timer(&p->hold_timer); - br_fdb_delete_by_port(br, p, 0); + br_fdb_delete_by_port(br, p, 0, 0); br_multicast_disable_port(p); br_configuration_update(br); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 4905845a94e9..efe415ad842a 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -160,7 +160,7 @@ static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { - br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry + br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 574feea6a8cc..0d41f81838ff 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -741,6 +741,7 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) return -EINVAL; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); + br_fdb_delete_by_port(port->br, port, vid, 0); return __vlan_del(pv, vid); } -- cgit