From 05cc09de4c017663a217630682041066f2f9a5cd Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 5 Oct 2018 23:22:06 +0300 Subject: [PATCH 01/93] mac80211_hwsim: fix module init error paths for netlink There is no unregister netlink notifier and family on error paths in init_mac80211_hwsim(). Also there is an error path where hwsim_class is not destroyed. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Fixes: 62759361eb49 ("mac80211-hwsim: Provide multicast event for HWSIM_CMD_NEW_RADIO") Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index aa8058264d5b..07f958c63334 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3703,16 +3703,16 @@ static int __init init_mac80211_hwsim(void) if (err) goto out_unregister_pernet; + err = hwsim_init_netlink(); + if (err) + goto out_unregister_driver; + hwsim_class = class_create(THIS_MODULE, "mac80211_hwsim"); if (IS_ERR(hwsim_class)) { err = PTR_ERR(hwsim_class); - goto out_unregister_driver; + goto out_exit_netlink; } - err = hwsim_init_netlink(); - if (err < 0) - goto out_unregister_driver; - for (i = 0; i < radios; i++) { struct hwsim_new_radio_params param = { 0 }; @@ -3818,6 +3818,8 @@ out_free_mon: free_netdev(hwsim_mon); out_free_radios: mac80211_hwsim_free(); +out_exit_netlink: + hwsim_exit_netlink(); out_unregister_driver: platform_driver_unregister(&mac80211_hwsim_driver); out_unregister_pernet: From 33483a6b88e4c4c3fc50178b185da52c55288b95 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 16 Oct 2018 02:35:30 +0000 Subject: [PATCH 02/93] mac80211: fix missing unlock on error in ieee80211_mark_sta_auth() Add the missing unlock before return from function ieee80211_mark_sta_auth() in the error handling case. 
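The pattern of the fix, as a minimal standalone sketch (plain C with pthreads, not the actual mac80211 code; names are illustrative): error paths taken while the lock is held jump to a single unlock label instead of returning directly, so the mutex is released exactly once on every path.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t sta_lock = PTHREAD_MUTEX_INITIALIZER;
static int sta_present;                 /* stand-in for the protected state */

static bool mark_sta_auth(void)
{
	bool result = true;

	pthread_mutex_lock(&sta_lock);

	if (!sta_present) {
		result = false;
		goto out;               /* was: "return false" with the lock still held */
	}

	sta_present = 2;                /* stand-in for moving the STA to AUTH state */
out:
	pthread_mutex_unlock(&sta_lock);
	return result;
}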
Cc: stable@vger.kernel.org Fixes: fc107a933071 ("mac80211: Helper function for marking STA authenticated") Signed-off-by: Wei Yongjun [use result variable/label instead of duplicating] Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d2bc8d57c87e..bcf5ffc1567a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2766,6 +2766,7 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sta_info *sta; + bool result = true; sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; @@ -2778,15 +2779,18 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, sta = sta_info_get(sdata, bssid); if (!sta) { WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid); - return false; + result = false; + goto out; } if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { sdata_info(sdata, "failed moving %pM to auth\n", bssid); - return false; + result = false; + goto out; } - mutex_unlock(&sdata->local->sta_mtx); - return true; +out: + mutex_unlock(&sdata->local->sta_mtx); + return result; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, From a1881c9b8a1edef0a5ae1d5c1b61406fe3402114 Mon Sep 17 00:00:00 2001 From: Vasyl Vavrychuk Date: Thu, 18 Oct 2018 01:02:12 +0300 Subject: [PATCH 03/93] mac80211_hwsim: Timer should be initialized before device registered Otherwise if network manager starts configuring Wi-Fi interface immidiatelly after getting notification of its creation, we will get NULL pointer dereference: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] hrtimer_active+0x28/0x50 ... Call Trace: [] ? hrtimer_try_to_cancel+0x27/0x110 [] ? hrtimer_cancel+0x15/0x20 [] ? mac80211_hwsim_config+0x140/0x1c0 [mac80211_hwsim] Cc: stable@vger.kernel.org Signed-off-by: Vasyl Vavrychuk Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 07f958c63334..d1464e3e1be2 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2884,6 +2884,10 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); + tasklet_hrtimer_init(&data->beacon_timer, + mac80211_hwsim_beacon, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + err = ieee80211_register_hw(hw); if (err < 0) { pr_debug("mac80211_hwsim: ieee80211_register_hw failed (%d)\n", @@ -2908,10 +2912,6 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, data->debugfs, data, &hwsim_simulate_radar); - tasklet_hrtimer_init(&data->beacon_timer, - mac80211_hwsim_beacon, - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - spin_lock_bh(&hwsim_radio_lock); err = rhashtable_insert_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); From 81c5dce2cd0bb0ecb61b6212410da5eb78cd8f79 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 19 Oct 2018 15:40:13 +0000 Subject: [PATCH 04/93] cfg80211: add missing constraint for user-supplied VHT mask Do a logical vht_capa &= vht_capa_mask of user-supplied VHT mask with the driver-supplied mask of modifiable VHT capabilities. Fix whitespaces and comment typos. 
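What the added constraint does, as a minimal sketch (simplified stand-in types, not the real cfg80211 structures): the user-supplied VHT override mask is ANDed byte-wise with the driver's mask of modifiable capability bits, so userspace can only affect bits the driver has declared changeable.

#include <stdint.h>
#include <string.h>

struct vht_cap { uint8_t raw[12]; };    /* simplified stand-in for the VHT capabilities */

static void oper_and_vht_capa(struct vht_cap *vht_capa,
                              const struct vht_cap *vht_capa_mask)
{
	uint8_t *p1 = (uint8_t *)vht_capa;
	const uint8_t *p2 = (const uint8_t *)vht_capa_mask;
	size_t i;

	if (!vht_capa_mask) {
		/* no driver mask: nothing is modifiable */
		memset(vht_capa, 0, sizeof(*vht_capa));
		return;
	}

	for (i = 0; i < sizeof(*vht_capa); i++)
		p1[i] &= p2[i];          /* keep only driver-modifiable bits */
}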
Signed-off-by: Sergey Matyukevich Signed-off-by: Johannes Berg --- net/wireless/mlme.c | 4 ++-- net/wireless/sme.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 12b3edf70a7b..1615e503f8e3 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -272,11 +272,11 @@ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); - for (i = 0; iht_capa_mask, rdev->wiphy.ht_capa_mod_mask); + cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, + rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; From c752cac9db1b0c469db7ba9d17af4ba708984db5 Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Tue, 23 Oct 2018 11:24:44 +0800 Subject: [PATCH 05/93] mac80211: fix GFP_KERNEL under tasklet context cfg80211_sta_opmode_change_notify needs a gfp_t flag to hint the nl80211 stack when allocating new skb, but it is called under tasklet context here with GFP_KERNEL and kernel will yield a warning about it. Cc: stable@vger.kernel.org Fixes: ff84e7bfe176 ("mac80211: Add support to notify ht/vht opmode modification.") Signed-off-by: Yan-Hsuan Chuang ACKed-by: Larry Finger Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3bd3b5769797..a69ecfb212ed 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3063,7 +3063,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { @@ -3100,7 +3100,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } default: From 5c21e8100dfd57c806e833ae905e26efbb87840f Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Tue, 23 Oct 2018 13:36:52 -0700 Subject: [PATCH 06/93] mac80211: Clear beacon_int in ieee80211_do_stop This fixes stale beacon-int values that would keep a netdev from going up. To reproduce: Create two VAP on one radio. vap1 has beacon-int 100, start it. vap2 has beacon-int 240, start it (and it will fail because beacon-int mismatch). reconfigure vap2 to have beacon-int 100 and start it. It will fail because the stale beacon-int 240 will be used in the ifup path and hostapd never gets a chance to set the new beacon interval. Cc: stable@vger.kernel.org Signed-off-by: Ben Greear Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5836ddeac9e3..5f3c81e705c7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1015,6 +1015,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (local->open_count == 0) ieee80211_clear_tx_pending(local); + sdata->vif.bss_conf.beacon_int = 0; + /* * If the interface goes down while suspended, presumably because * the device was unplugged and that happens before our resume, From c177db2d0d5e751d52d3827b8cfdb6ef92a95a2d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 30 Oct 2018 09:17:44 +0100 Subject: [PATCH 07/93] cfg80211/mac80211: fix FTM settings across CSA When FTM is enabled, doing a CSA will unexpectedly lose it since the value of ftm_responder may be initialized to 0 instead of -1, so fix that. 
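The convention the fix restores, as a small sketch (simplified struct, not the real cfg80211 one): ftm_responder is a tri-state where -1 means "leave the current FTM responder setting alone"; zero-initializing it instead makes every channel switch silently disable FTM.

struct csa_beacon_params {
	int ftm_responder;              /* -1 = no change, 0 = disable, 1 = enable */
};

static void apply_csa_ftm(const struct csa_beacon_params *params,
                          int *ftm_enabled)
{
	if (params->ftm_responder >= 0)
		*ftm_enabled = params->ftm_responder;
	/* -1: keep whatever was configured before the channel switch */
}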
Fixes: 81e54d08d9d8 ("cfg80211: support FTM responder configuration/statistics") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 744b5851bbf9..8d763725498c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7870,6 +7870,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) } memset(¶ms, 0, sizeof(params)); + params.beacon_csa.ftm_responder = -1; if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]) From 03b738625b1e58f4ae2bddf04706ab85c677af2d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 30 Oct 2018 09:17:45 +0100 Subject: [PATCH 08/93] mac80211: fix CSA beacon allocation size If the FTM responder settings are changed simultaneously with the CSA beacon, the buffer size allocated isn't sufficient and we'll have a heap overrun. Fix this. While at it, also clean up the ftm_responder assignment, doing it only if ftm_responder is non-zero is valid as it's 0 to start with, but not really useful to understand the code. Fixes: bc847970f432 ("mac80211: support FTM responder configuration/statistics") Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 51622333d460..818aa0060349 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2891,7 +2891,7 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + - beacon->probe_resp_len; + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) @@ -2934,8 +2934,9 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } - if (beacon->ftm_responder) - new_beacon->ftm_responder = beacon->ftm_responder; + + /* might copy -1, meaning no changes requested */ + new_beacon->ftm_responder = beacon->ftm_responder; if (beacon->lci) { new_beacon->lci_len = beacon->lci_len; new_beacon->lci = pos; From 113f3aaa81bd56aba02659786ed65cbd9cb9a6fc Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 19 Oct 2018 14:42:59 +0530 Subject: [PATCH 09/93] cfg80211: Prevent regulatory restore during STA disconnect in concurrent interfaces Currently when an AP and STA interfaces are active in the same or different radios, regulatory settings are restored whenever the STA disconnects. This restores all channel information including dfs states in all radios. For example, if an AP interface is active in one radio and STA in another, when radar is detected on the AP interface, the dfs state of the channel will be changed to UNAVAILABLE. But when the STA interface disconnects, this issues a regulatory disconnect hint which restores all regulatory settings in all the radios attached and thereby losing the stored dfs state on the other radio where the channel was marked as unavailable earlier. Hence prevent such regulatory restore whenever another active beaconing interface is present in the same or other radios. 
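The shape of the check, as a minimal sketch (simplified data structures, not the real cfg80211 ones): the disconnect hint, which would wipe learned channel and DFS state on every radio, is only issued when no interface is connected, connecting, or actively beaconing.

#include <stdbool.h>
#include <stddef.h>

struct wdev {
	bool connected;
	bool connecting;
	bool beaconing;                 /* e.g. an AP or mesh interface that is up */
	struct wdev *next;
};

static bool all_interfaces_idle(const struct wdev *list)
{
	const struct wdev *w;

	for (w = list; w; w = w->next)
		if (w->connected || w->connecting || w->beaconing)
			return false;

	return true;
}

/* caller: issue the regulatory disconnect hint only if all_interfaces_idle() */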
Signed-off-by: Sriram R Signed-off-by: Johannes Berg --- net/wireless/sme.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 1c93412038dc..f741d8376a46 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -642,11 +642,15 @@ static bool cfg80211_is_all_idle(void) * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. + * Also if there is any other active beaconing interface we + * need not issue a disconnect hint and reset any info such + * as chan dfs state, etc. */ list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); - if (wdev->conn || wdev->current_bss) + if (wdev->conn || wdev->current_bss || + cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; wdev_unlock(wdev); } From cdbb096adddb3f42584cecb5ec2e07c26815b71f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:27 -0800 Subject: [PATCH 10/93] bpf: btf: implement btf_name_valid_identifier() Function btf_name_valid_identifier() have been implemented in bpf-next commit 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO"). Backport this function so later patch can use it. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee4c82667d65..93c233ab2db6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -426,6 +427,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + /* offset must be valid */ + const char *src = &btf->strings[offset]; + const char *src_limit; + + if (!isalpha(*src) && *src != '_') + return false; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isalnum(*src) && *src != '_') + return false; + src++; + } + + return !*src; +} + static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) From eb04bbb608e683f8fd3ef7f716e2fa32dd90861f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:28 -0800 Subject: [PATCH 11/93] bpf: btf: check name validity for various types This patch added name checking for the following types: . BTF_KIND_PTR, BTF_KIND_ARRAY, BTF_KIND_VOLATILE, BTF_KIND_CONST, BTF_KIND_RESTRICT: the name must be null . BTF_KIND_STRUCT, BTF_KIND_UNION: the struct/member name is either null or a valid identifier . BTF_KIND_ENUM: the enum type name is either null or a valid identifier; the enumerator name must be a valid identifier. . BTF_KIND_FWD: the name must be a valid identifier . BTF_KIND_TYPEDEF: the name must be a valid identifier For those places a valid name is required, the name must be a valid C identifier. This can be relaxed later if we found use cases for a different (non-C) frontend. 
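A standalone version of the identifier rule described above (plain C re-implementation for illustration; the length limit is a stand-in for KSYM_NAME_LEN): a name is accepted only if it looks like a C identifier, i.e. a leading letter or underscore followed by letters, digits or underscores.

#include <ctype.h>
#include <stdbool.h>

#define MAX_IDENT_LEN 128               /* stand-in for KSYM_NAME_LEN */

static bool name_valid_identifier(const char *src)
{
	const char *src_limit = src + MAX_IDENT_LEN;

	if (!isalpha((unsigned char)*src) && *src != '_')
		return false;

	src++;
	while (*src && src < src_limit) {
		if (!isalnum((unsigned char)*src) && *src != '_')
			return false;
		src++;
	}

	return !*src;                   /* also rejects names longer than the limit */
}

/* e.g. "const_void_ptr" and "_tmp3" pass; "", "3abc" and "A!" are rejected */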
Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 93c233ab2db6..4da543d6bea2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1168,6 +1168,22 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* typedef type must have a valid name, and other ref types, + * volatile, const, restrict, should have a null name. + */ + if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } else { + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1325,6 +1341,13 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* fwd type must have a valid name */ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1381,6 +1404,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* array type should not have a name */ + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; @@ -1557,6 +1586,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); last_offset = 0; @@ -1568,6 +1604,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct member either no name or a valid one */ + if (member->name_off && + !btf_name_valid_identifier(btf, member->name_off)) { + btf_verifier_log_member(env, t, member, "Invalid name"); + return -EINVAL; + } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, @@ -1755,6 +1797,13 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { @@ -1764,6 +1813,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log(env, "\t%s val=%d\n", btf_name_by_offset(btf, enums[i].name_off), enums[i].val); From 8800cd031af085807028656c6ba7eb7908d78262 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 27 Nov 2018 13:23:29 -0800 Subject: [PATCH 12/93] tools/bpf: fix two test_btf unit test cases There are two unit test cases, which should encode TYPEDEF type, but instead encode PTR type. 
The error is flagged out after enforcing name checking in the previous patch. Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests") Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_btf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index f42b3396d622..b361bb851829 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -432,11 +432,11 @@ static struct btf_raw_test raw_tests[] = { /* const void* */ /* [3] */ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* typedef const void * const_void_ptr */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3), - /* struct A { */ /* [4] */ + BTF_TYPEDEF_ENC(NAME_TBD, 3), /* [4] */ + /* struct A { */ /* [5] */ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)), /* const_void_ptr m; */ - BTF_MEMBER_ENC(NAME_TBD, 3, 0), + BTF_MEMBER_ENC(NAME_TBD, 4, 0), /* } */ BTF_END_RAW, }, @@ -494,10 +494,10 @@ static struct btf_raw_test raw_tests[] = { BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), /* const void* */ /* [3] */ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), - /* typedef const void * const_void_ptr */ /* [4] */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3), - /* const_void_ptr[4] */ /* [5] */ - BTF_TYPE_ARRAY_ENC(3, 1, 4), + /* typedef const void * const_void_ptr */ + BTF_TYPEDEF_ENC(NAME_TBD, 3), /* [4] */ + /* const_void_ptr[4] */ + BTF_TYPE_ARRAY_ENC(4, 1, 4), /* [5] */ BTF_END_RAW, }, .str_sec = "\0const_void_ptr", From d08489125e04a9f73d9323caea43270fd22d395f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:30 -0800 Subject: [PATCH 13/93] tools/bpf: add addition type tests to test_btf The following additional unit testcases are added to test_btf: ... BTF raw test[42] (typedef (invalid name, name_off = 0)): OK BTF raw test[43] (typedef (invalid name, invalid identifier)): OK BTF raw test[44] (ptr type (invalid name, name_off <> 0)): OK BTF raw test[45] (volatile type (invalid name, name_off <> 0)): OK BTF raw test[46] (const type (invalid name, name_off <> 0)): OK BTF raw test[47] (restrict type (invalid name, name_off <> 0)): OK BTF raw test[48] (fwd type (invalid name, name_off = 0)): OK BTF raw test[49] (fwd type (invalid name, invalid identifier)): OK BTF raw test[50] (array type (invalid name, name_off <> 0)): OK BTF raw test[51] (struct type (name_off = 0)): OK BTF raw test[52] (struct type (invalid name, invalid identifier)): OK BTF raw test[53] (struct member (name_off = 0)): OK BTF raw test[54] (struct member (invalid name, invalid identifier)): OK BTF raw test[55] (enum type (name_off = 0)): OK BTF raw test[56] (enum type (invalid name, invalid identifier)): OK BTF raw test[57] (enum member (invalid name, name_off = 0)): OK BTF raw test[58] (enum member (invalid name, invalid identifier)): OK ... 
Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_btf.c | 361 +++++++++++++++++++++++++ 1 file changed, 361 insertions(+) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index b361bb851829..38e1cbaaffdb 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -1292,6 +1292,367 @@ static struct btf_raw_test raw_tests[] = { .err_str = "type != 0", }, +{ + .descr = "typedef (invalid name, name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(0, 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "typedef_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "typedef (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(NAME_TBD, 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__!int", + .str_sec_size = sizeof("\0__!int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "typedef_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "ptr type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "ptr_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "volatile type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "volatile_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "const type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "const_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "restrict type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 1), /* [2] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_RESTRICT, 0, 0), 2), /* [3] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + 
.map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "restrict_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "fwd type (invalid name, name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 0), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__skb", + .str_sec_size = sizeof("\0__skb"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "fwd_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "fwd type (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 0), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__!skb", + .str_sec_size = sizeof("\0__!skb"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "fwd_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "array type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), /* [2] */ + BTF_ARRAY_ENC(1, 1, 4), + BTF_END_RAW, + }, + .str_sec = "\0__skb", + .str_sec_size = sizeof("\0__skb"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "struct type (name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A", + .str_sec_size = sizeof("\0A"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "struct type (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A!\0B", + .str_sec_size = sizeof("\0A!\0B"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "struct member (name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A", + .str_sec_size = sizeof("\0A"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "struct member (invalid name, invalid identifier)", + .raw_types = { + 
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A\0B*", + .str_sec_size = sizeof("\0A\0B*"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "enum type (name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_END_RAW, + }, + .str_sec = "\0A\0B", + .str_sec_size = sizeof("\0A\0B"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "enum type (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_END_RAW, + }, + .str_sec = "\0A!\0B", + .str_sec_size = sizeof("\0A!\0B"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "enum member (invalid name, name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(0, 0), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "enum member (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_END_RAW, + }, + .str_sec = "\0A!", + .str_sec_size = sizeof("\0A!"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, { .descr = "arraymap invalid btf key (a bit field)", .raw_types = { From 528bff0cdb6649f97f2c4802e4ac7a4b50645f2f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 28 Nov 2018 09:38:23 -0800 Subject: [PATCH 14/93] tools: bpftool: fix a bitfield pretty print issue Commit b12d6ec09730 ("bpf: btf: add btf print functionality") added btf pretty print functionality to bpftool. There is a problem though in printing a bitfield whose type has modifiers. For example, for a type like typedef int ___int; struct tmp_t { int a:3; ___int b:3; }; Suppose we have a map struct bpf_map_def SEC("maps") tmpmap = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(__u32), .value_size = sizeof(struct tmp_t), .max_entries = 1, }; and the hash table is populated with one element with key 0 and value (.a = 1 and .b = 2). 
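A worked example of the wrong output (assuming the usual little-endian bitfield layout, so "a" occupies bits 0-2 and "b" bits 3-5 of the first byte): with .a = 1 and .b = 2 that byte is 0x11, and dropping the bit offset when following the typedef makes the dumper read bits 0-2 twice, printing 0x1 for "b" instead of 0x2.

#include <stdio.h>

static unsigned int read_bitfield(unsigned int byte, unsigned int bit_offset,
                                  unsigned int nr_bits)
{
	return (byte >> bit_offset) & ((1u << nr_bits) - 1);
}

int main(void)
{
	unsigned int byte = 1 | (2 << 3);               /* .a = 1, .b = 2  ->  0x11 */

	printf("a = %#x\n", read_bitfield(byte, 0, 3)); /* 0x1 */
	printf("b = %#x\n", read_bitfield(byte, 3, 3)); /* 0x2, correct */
	printf("b = %#x\n", read_bitfield(byte, 0, 3)); /* 0x1, what happens when the offset is lost */
	return 0;
}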
In BTF, the struct member "b" will have a type "typedef" which points to an int type. The current implementation does not pass the bit offset during transition from typedef to int type, hence incorrectly print the value as $ bpftool m d id 79 [{ "key": 0, "value": { "a": 0x1, "b": 0x1 } } ] This patch fixed the issue by carrying bit_offset along the type chain during bit_field print. The correct result can be printed as $ bpftool m d id 76 [{ "key": 0, "value": { "a": 0x1, "b": 0x2 } } ] The kernel pretty print is implemented correctly and does not have this issue. Fixes: b12d6ec09730 ("bpf: btf: add btf print functionality") Signed-off-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf_dumper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 55bc512a1831..e4e6e2b3fd84 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -32,7 +32,7 @@ static void btf_dumper_ptr(const void *data, json_writer_t *jw, } static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id, - const void *data) + __u8 bit_offset, const void *data) { int actual_type_id; @@ -40,7 +40,7 @@ static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id, if (actual_type_id < 0) return actual_type_id; - return btf_dumper_do_type(d, actual_type_id, 0, data); + return btf_dumper_do_type(d, actual_type_id, bit_offset, data); } static void btf_dumper_enum(const void *data, json_writer_t *jw) @@ -237,7 +237,7 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: - return btf_dumper_modifier(d, type_id, data); + return btf_dumper_modifier(d, type_id, bit_offset, data); default: jsonw_printf(d->jw, "(unsupported-kind"); return -EINVAL; From 09ee3b4a249dd5c64da7d25a52a4ce42a49d647a Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 29 Nov 2018 17:08:36 +0900 Subject: [PATCH 15/93] net: ethernet: ave: Increase descriptors to improve performance To improve performance, this increases Rx descriptor to 256, Tx descriptor to 64, and adjusts NAPI weight to NAPI_POLL_WEIGHT. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/socionext/sni_ave.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 6732f5cbde08..29b5b12bce6c 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -185,8 +185,8 @@ NETIF_MSG_TX_ERR) /* Parameter for descriptor */ -#define AVE_NR_TXDESC 32 /* Tx descriptor */ -#define AVE_NR_RXDESC 64 /* Rx descriptor */ +#define AVE_NR_TXDESC 64 /* Tx descriptor */ +#define AVE_NR_RXDESC 256 /* Rx descriptor */ #define AVE_DESC_OFS_CMDSTS 0 #define AVE_DESC_OFS_ADDRL 4 @@ -1689,9 +1689,10 @@ static int ave_probe(struct platform_device *pdev) pdev->name, pdev->id); /* Register as a NAPI supported driver */ - netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, priv->rx.ndesc); + netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, + NAPI_POLL_WEIGHT); netif_tx_napi_add(ndev, &priv->napi_tx, ave_napi_poll_tx, - priv->tx.ndesc); + NAPI_POLL_WEIGHT); platform_set_drvdata(pdev, ndev); From 88113957ddb7b7d5451e28cd708c82ea7e63b097 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 29 Nov 2018 17:08:37 +0900 Subject: [PATCH 16/93] net: ethernet: ave: Replace NET_IP_ALIGN with AVE_FRAME_HEADROOM In commit 26a4676faa1a ("arm64: mm: define NET_IP_ALIGN to 0"), AVE controller affects this modification because the controller forces to ignore lower 2bits of buffer start address, and make 2-byte headroom, that is, data reception starts from (buffer + 2). This patch defines AVE_FRAME_HEADROOM macro as hardware-specific value, and replaces NET_IP_ALIGN with it. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/sni_ave.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 29b5b12bce6c..0da11344d035 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -194,6 +194,7 @@ /* Parameter for ethernet frame */ #define AVE_MAX_ETHFRAME 1518 +#define AVE_FRAME_HEADROOM 2 /* Parameter for interrupt */ #define AVE_INTM_COUNT 20 @@ -576,12 +577,13 @@ static int ave_rxdesc_prepare(struct net_device *ndev, int entry) skb = priv->rx.desc[entry].skbs; if (!skb) { - skb = netdev_alloc_skb_ip_align(ndev, - AVE_MAX_ETHFRAME); + skb = netdev_alloc_skb(ndev, AVE_MAX_ETHFRAME); if (!skb) { netdev_err(ndev, "can't allocate skb for Rx\n"); return -ENOMEM; } + skb->data += AVE_FRAME_HEADROOM; + skb->tail += AVE_FRAME_HEADROOM; } /* set disable to cmdsts */ @@ -594,12 +596,12 @@ static int ave_rxdesc_prepare(struct net_device *ndev, int entry) * - Rx buffer begins with 2 byte headroom, and data will be put from * (buffer + 2). * To satisfy this, specify the address to put back the buffer - * pointer advanced by NET_IP_ALIGN by netdev_alloc_skb_ip_align(), - * and expand the map size by NET_IP_ALIGN. + * pointer advanced by AVE_FRAME_HEADROOM, and expand the map size + * by AVE_FRAME_HEADROOM. 
*/ ret = ave_dma_map(ndev, &priv->rx.desc[entry], - skb->data - NET_IP_ALIGN, - AVE_MAX_ETHFRAME + NET_IP_ALIGN, + skb->data - AVE_FRAME_HEADROOM, + AVE_MAX_ETHFRAME + AVE_FRAME_HEADROOM, DMA_FROM_DEVICE, &paddr); if (ret) { netdev_err(ndev, "can't map skb for Rx\n"); From d75d0e874ffe929dec143d331b53e4bfceb10af2 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 29 Nov 2018 17:08:38 +0900 Subject: [PATCH 17/93] net: ethernet: ave: Add MODULE_AUTHOR and MAINTAINERS entry Add missing MODULE_AUTHOR of ave driver and an entry to MAINTAINERS. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- MAINTAINERS | 7 +++++++ drivers/net/ethernet/socionext/sni_ave.c | 1 + 2 files changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index da57abebaab3..51d35f0ab989 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13885,6 +13885,13 @@ F: drivers/md/raid* F: include/linux/raid/ F: include/uapi/linux/raid/ +SOCIONEXT (SNI) AVE NETWORK DRIVER +M: Kunihiko Hayashi +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/socionext/sni_ave.c +F: Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt + SOCIONEXT (SNI) NETSEC NETWORK DRIVER M: Jassi Brar L: netdev@vger.kernel.org diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 0da11344d035..7c7cd9d94bcc 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1916,5 +1916,6 @@ static struct platform_driver ave_driver = { }; module_platform_driver(ave_driver); +MODULE_AUTHOR("Kunihiko Hayashi "); MODULE_DESCRIPTION("Socionext UniPhier AVE ethernet driver"); MODULE_LICENSE("GPL v2"); From d7f7e0018b96fd1a30a968faa9464eb57372c1ec Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 29 Nov 2018 12:40:11 +0200 Subject: [PATCH 18/93] net: phy: sfp: correct store of detected link modes The link modes that sfp_parse_support() detects are stored in the 'modes' bitmap. There is no reason to make an exception for 1000Base-PX or 1000Base-BX10. Fixes: 03145864bd0f ("sfp: support 1G BiDi (eg, FiberStore SFP-GE-BX) modules") Signed-off-by: Baruch Siach Acked-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/sfp-bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 83060fb349f4..ad9db652874d 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -162,7 +162,7 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, /* 1000Base-PX or 1000Base-BX10 */ if ((id->base.e_base_px || id->base.e_base_bx10) && br_min <= 1300 && br_max >= 1200) - phylink_set(support, 1000baseX_Full); + phylink_set(modes, 1000baseX_Full); /* For active or passive cables, select the link modes * based on the bit rates and the cable compliance bytes. From f28c020fb488e1a8b87469812017044bef88aa2b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 29 Nov 2018 14:14:48 +0100 Subject: [PATCH 19/93] net: restore call to netdev_queue_numa_node_write when resetting XPS Before commit 80d19669ecd3 ("net: Refactor XPS for CPUs and Rx queues"), netif_reset_xps_queues() did netdev_queue_numa_node_write() for all the queues being reset. Now, this is only done when the "active" variable in clean_xps_maps() is false, ie when on all the CPUs, there's no active XPS mapping left. Fixes: 80d19669ecd3 ("net: Refactor XPS for CPUs and Rx queues") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- net/core/dev.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index ddc551f24ba2..32a63f4c3a92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2187,18 +2187,20 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) { - if (is_rxqs_map) { + if (is_rxqs_map) RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { + else RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); - } kfree_rcu(dev_maps, rcu); } + + if (!is_rxqs_map) { + for (i = offset + (count - 1); count--; i--) { + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + } + } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, From 867d0ad476db89a1e8af3f297af402399a54eea5 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 29 Nov 2018 14:14:49 +0100 Subject: [PATCH 20/93] net: fix XPS static_key accounting Commit 04157469b7b8 ("net: Use static_key for XPS maps") introduced a static key for XPS, but the increments/decrements don't match. First, the static key's counter is incremented once for each queue, but only decremented once for a whole batch of queues, leading to large unbalances. Second, the xps_rxqs_needed key is decremented whenever we reset a batch of queues, whether they had any rxqs mapping or not, so that if we setup cpu-XPS on em1 and RXQS-XPS on em2, resetting the queues on em1 would decrement the xps_rxqs_needed key. This reworks the accounting scheme so that the xps_needed key is incremented only once for each type of XPS for all the queues on a device, and the xps_rxqs_needed key is incremented only once for all queues. This is sufficient to let us retrieve queues via get_xps_queue(). This patch introduces a new reset_xps_maps(), which reinitializes and frees the appropriate map (xps_rxqs_map or xps_cpus_map), and drops a reference to the needed keys: - both xps_needed and xps_rxqs_needed, in case of rxqs maps, - only xps_needed, in case of CPU maps. Now, we also need to call reset_xps_maps() at the end of __netif_set_xps_queue() when there's no active map left, for example when writing '00000000,00000000' to all queues' xps_rxqs setting. Fixes: 04157469b7b8 ("net: Use static_key for XPS maps") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- net/core/dev.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 32a63f4c3a92..3470e7fff1f4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2175,6 +2175,20 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + bool is_rxqs_map) +{ + if (is_rxqs_map) { + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + } + static_key_slow_dec_cpuslocked(&xps_needed); + kfree_rcu(dev_maps, rcu); +} + static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, struct xps_dev_maps *dev_maps, unsigned int nr_ids, u16 offset, u16 count, bool is_rxqs_map) @@ -2186,13 +2200,8 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); - if (!active) { - if (is_rxqs_map) - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - else - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); if (!is_rxqs_map) { for (i = offset + (count - 1); count--; i--) { @@ -2236,10 +2245,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, false); out_no_maps: - if (static_key_enabled(&xps_rxqs_needed)) - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - - static_key_slow_dec_cpuslocked(&xps_needed); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2357,9 +2362,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; - static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) - static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { @@ -2457,13 +2465,8 @@ out_no_new_maps: } /* free map if not active */ - if (!active) { - if (is_rxqs_map) - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - else - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); out_no_maps: mutex_unlock(&xps_map_mutex); From 90230968f102acbe103fbf7c03d41addfef5f153 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 29 Nov 2018 12:00:05 +0200 Subject: [PATCH 21/93] net: phy: sfp: correct location of SFP standards SFP standards are now available from the SNIA (Storage Networking Industry Association) website. Cc: Andrew Lunn Cc: Florian Fainelli Signed-off-by: Baruch Siach Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/sfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sfp.h b/include/linux/sfp.h index d37518e89db2..d9d9de3fcf8e 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -224,7 +224,7 @@ struct sfp_eeprom_ext { * * See the SFF-8472 specification and related documents for the definition * of these structure members. 
This can be obtained from - * ftp://ftp.seagate.com/sff + * https://www.snia.org/technology-communities/sff/specifications */ struct sfp_eeprom_id { struct sfp_eeprom_base base; From d449ba3d581ed29f751a59792fdc775572c66904 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 27 Nov 2018 09:50:27 +0100 Subject: [PATCH 22/93] net/x25: fix called/calling length calculation in x25_parse_address_block The length of the called and calling address was not calculated correctly (BCD encoding). Signed-off-by: Martin Schiller Signed-off-by: David S. Miller --- net/x25/af_x25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d49aa79b7997..5226a7f43050 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -100,7 +100,7 @@ int x25_parse_address_block(struct sk_buff *skb, } len = *skb->data; - needed = 1 + (len >> 4) + (len & 0x0f); + needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims From 06137619f061f498c2924f6543fa45b7d39f0501 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 27 Nov 2018 09:50:28 +0100 Subject: [PATCH 23/93] net/x25: fix null_x25_address handling o x25_find_listener(): the compare for the null_x25_address was wrong. We have to check the x25_addr of the listener socket instead of the x25_addr of the incomming call. o x25_bind(): it was not possible to bind a socket to null_x25_address Signed-off-by: Martin Schiller Signed-off-by: David S. Miller --- net/x25/af_x25.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5226a7f43050..5121729b8b63 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -288,7 +288,7 @@ static struct sock *x25_find_listener(struct x25_address *addr, sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || - !strcmp(addr->x25_addr, + !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* @@ -688,11 +688,15 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - len = strlen(addr->sx25_addr.x25_addr); - for (i = 0; i < len; i++) { - if (!isdigit(addr->sx25_addr.x25_addr[i])) { - rc = -EINVAL; - goto out; + /* check for the null_x25_address */ + if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { + + len = strlen(addr->sx25_addr.x25_addr); + for (i = 0; i < len; i++) { + if (!isdigit(addr->sx25_addr.x25_addr[i])) { + rc = -EINVAL; + goto out; + } } } From b020fcf6bb4b2d980298c416b3f407075aa2b3b6 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 27 Nov 2018 09:50:29 +0100 Subject: [PATCH 24/93] net/x25: handle call collisions If a session in X25_STATE_1 (Awaiting Call Accept) receives a call request, the session will be closed (x25_disconnect), cause=0x01 (Number Busy) and diag=0x48 (Call Collision) will be set and a clear request will be send. Signed-off-by: Martin Schiller Signed-off-by: David S. 
Miller --- net/x25/x25_in.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 3c12cae32001..afb26221d8a8 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -142,6 +142,15 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->sk_state_change(sk); break; } + case X25_CALL_REQUEST: + /* call collision */ + x25->causediag.cause = 0x01; + x25->causediag.diagnostic = 0x48; + + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_disconnect(sk, EISCONN, 0x01, 0x48); + break; + case X25_CLEAR_REQUEST: if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2)) goto out_clear; From 9410d386d0a829ace9558336263086c2fbbe8aed Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 29 Nov 2018 16:01:04 -0800 Subject: [PATCH 25/93] net: Prevent invalid access to skb->prev in __qdisc_drop_all __qdisc_drop_all() accesses skb->prev to get to the tail of the segment-list. With commit 68d2f84a1368 ("net: gro: properly remove skb from list") the skb-list handling has been changed to set skb->next to NULL and set the list-poison on skb->prev. With that change, __qdisc_drop_all() will panic when it tries to dereference skb->prev. Since commit 992cba7e276d ("net: Add and use skb_list_del_init().") __list_del_entry is used, leaving skb->prev unchanged (thus, pointing to the list-head if it's the first skb of the list). This will make __qdisc_drop_all modify the next-pointer of the list-head and result in a panic later on: [ 34.501053] general protection fault: 0000 [#1] SMP KASAN PTI [ 34.501968] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.20.0-rc2.mptcp #108 [ 34.502887] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 [ 34.504074] RIP: 0010:dev_gro_receive+0x343/0x1f90 [ 34.504751] Code: e0 48 c1 e8 03 42 80 3c 30 00 0f 85 4a 1c 00 00 4d 8b 24 24 4c 39 65 d0 0f 84 0a 04 00 00 49 8d 7c 24 38 48 89 f8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 74 08 3c 04 [ 34.507060] RSP: 0018:ffff8883af507930 EFLAGS: 00010202 [ 34.507761] RAX: 0000000000000007 RBX: ffff8883970b2c80 RCX: 1ffff11072e165a6 [ 34.508640] RDX: 1ffff11075867008 RSI: ffff8883ac338040 RDI: 0000000000000038 [ 34.509493] RBP: ffff8883af5079d0 R08: ffff8883970b2d40 R09: 0000000000000062 [ 34.510346] R10: 0000000000000034 R11: 0000000000000000 R12: 0000000000000000 [ 34.511215] R13: 0000000000000000 R14: dffffc0000000000 R15: ffff8883ac338008 [ 34.512082] FS: 0000000000000000(0000) GS:ffff8883af500000(0000) knlGS:0000000000000000 [ 34.513036] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 34.513741] CR2: 000055ccc3e9d020 CR3: 00000003abf32000 CR4: 00000000000006e0 [ 34.514593] Call Trace: [ 34.514893] [ 34.515157] napi_gro_receive+0x93/0x150 [ 34.515632] receive_buf+0x893/0x3700 [ 34.516094] ? __netif_receive_skb+0x1f/0x1a0 [ 34.516629] ? virtnet_probe+0x1b40/0x1b40 [ 34.517153] ? __stable_node_chain+0x4d0/0x850 [ 34.517684] ? kfree+0x9a/0x180 [ 34.518067] ? __kasan_slab_free+0x171/0x190 [ 34.518582] ? detach_buf+0x1df/0x650 [ 34.519061] ? lapic_next_event+0x5a/0x90 [ 34.519539] ? virtqueue_get_buf_ctx+0x280/0x7f0 [ 34.520093] virtnet_poll+0x2df/0xd60 [ 34.520533] ? receive_buf+0x3700/0x3700 [ 34.521027] ? qdisc_watchdog_schedule_ns+0xd5/0x140 [ 34.521631] ? htb_dequeue+0x1817/0x25f0 [ 34.522107] ? sch_direct_xmit+0x142/0xf30 [ 34.522595] ? virtqueue_napi_schedule+0x26/0x30 [ 34.523155] net_rx_action+0x2f6/0xc50 [ 34.523601] ? napi_complete_done+0x2f0/0x2f0 [ 34.524126] ? kasan_check_read+0x11/0x20 [ 34.524608] ? _raw_spin_lock+0x7d/0xd0 [ 34.525070] ? 
_raw_spin_lock_bh+0xd0/0xd0 [ 34.525563] ? kvm_guest_apic_eoi_write+0x6b/0x80 [ 34.526130] ? apic_ack_irq+0x9e/0xe0 [ 34.526567] __do_softirq+0x188/0x4b5 [ 34.527015] irq_exit+0x151/0x180 [ 34.527417] do_IRQ+0xdb/0x150 [ 34.527783] common_interrupt+0xf/0xf [ 34.528223] This patch makes sure that skb->prev is set to NULL when entering netem_enqueue. Cc: Prashant Bhole Cc: Tyler Hicks Cc: Eric Dumazet Fixes: 68d2f84a1368 ("net: gro: properly remove skb from list") Suggested-by: Eric Dumazet Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2c38e3d07924..22cd46a60057 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -431,6 +431,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, int count = 1; int rc = NET_XMIT_SUCCESS; + /* Do not fool qdisc_drop_all() */ + skb->prev = NULL; + /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) ++count; From 4135cce7fd0a0d755665c02728578c7c5afe4726 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 27 Nov 2018 19:11:50 +0800 Subject: [PATCH 26/93] sctp: update frag_point when stream_interleave is set sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu changes, but we missed one place in sctp_association_init(). It would cause frag_point is zero when sending data. As says in Jakub's reproducer, if sp->pathmtu is set by socketopt, the new asoc->pathmtu inherits it in sctp_association_init(). Later when transports are added and their pmtu >= asoc->pathmtu, it will never call sctp_assoc_update_frag_point() to set frag_point. This patch is to fix it by updating frag_point after asoc->pathmtu is set as sp->pathmtu in sctp_association_init(). Note that it moved them after sctp_stream_init(), as stream->si needs to be set first. Frag_point's calculation is also related with datachunk's type, so it needs to update frag_point when stream->si may be changed in sctp_process_init(). v1->v2: - call sctp_assoc_update_frag_point() separately in sctp_process_init and sctp_association_init, per Marcelo's suggestion. Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point") Reported-by: Jakub Audykowicz Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/associola.c | 7 ++++--- net/sctp/sm_make_chunk.c | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 6a28b96e779e..dd77ec3892b6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -118,9 +118,6 @@ static struct sctp_association *sctp_association_init( asoc->flowlabel = sp->flowlabel; asoc->dscp = sp->dscp; - /* Initialize default path MTU. */ - asoc->pathmtu = sp->pathmtu; - /* Set association default SACK delay */ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); asoc->sackfreq = sp->sackfreq; @@ -252,6 +249,10 @@ static struct sctp_association *sctp_association_init( 0, gfp)) goto fail_init; + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + sctp_assoc_update_frag_point(asoc); + /* Assume that peer would support both address types unless we are * told otherwise. 
*/ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 4a4fd1971255..f4ac6c592e13 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2462,6 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, asoc->c.sinit_max_instreams, gfp)) goto clean_up; + /* Update frag_point when stream_interleave may get changed. */ + sctp_assoc_update_frag_point(asoc); + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; From 5f2b8b62786853341a20d4cd4948f9cbca3db002 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 27 Nov 2018 14:21:43 +0100 Subject: [PATCH 27/93] net: stmmac: Move debugfs init/exit to ->probe()/->remove() Setting up and tearing down debugfs is currently unbalanced, as seen by this error during resume from suspend: [ 752.134067] dwc-eth-dwmac 2490000.ethernet eth0: ERROR failed to create debugfs directory [ 752.134347] dwc-eth-dwmac 2490000.ethernet eth0: stmmac_hw_setup: failed debugFS registration The imbalance happens because the driver creates the debugfs hierarchy when the device is opened and tears it down when the device is closed. There's little gain in that, and it could be argued that it is even surprising because it's not usually done for other devices. Fix the imbalance by moving the debugfs creation and teardown to the driver's ->probe() and ->remove() implementations instead. Note that the ring descriptors cannot be read while the interface is down, so make sure to return an empty file when the descriptors_status debugfs file is read. Signed-off-by: Thierry Reding Acked-by: Jose Abreu Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 076a8be18d67..5551fead8f66 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2550,12 +2550,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) netdev_warn(priv->dev, "PTP init failed\n"); } -#ifdef CONFIG_DEBUG_FS - ret = stmmac_init_fs(dev); - if (ret < 0) - netdev_warn(priv->dev, "%s: failed debugFS registration\n", - __func__); -#endif priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS; if (priv->use_riwt) { @@ -2756,10 +2750,6 @@ static int stmmac_release(struct net_device *dev) netif_carrier_off(dev); -#ifdef CONFIG_DEBUG_FS - stmmac_exit_fs(dev); -#endif - stmmac_release_ptp(priv); return 0; @@ -3899,6 +3889,9 @@ static int stmmac_sysfs_ring_read(struct seq_file *seq, void *v) u32 tx_count = priv->plat->tx_queues_to_use; u32 queue; + if ((dev->flags & IFF_UP) == 0) + return 0; + for (queue = 0; queue < rx_count; queue++) { struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; @@ -4397,6 +4390,13 @@ int stmmac_dvr_probe(struct device *device, goto error_netdev_register; } +#ifdef CONFIG_DEBUG_FS + ret = stmmac_init_fs(ndev); + if (ret < 0) + netdev_warn(priv->dev, "%s: failed debugFS registration\n", + __func__); +#endif + return ret; error_netdev_register: @@ -4432,6 +4432,9 @@ int stmmac_dvr_remove(struct device *dev) netdev_info(priv->dev, "%s: removing driver", __func__); +#ifdef CONFIG_DEBUG_FS + stmmac_exit_fs(ndev); +#endif stmmac_stop_all_dma(priv); stmmac_mac_set(priv, priv->ioaddr, false); From 56e0e295091dde5d0346fad08d3d8b6c07084c9d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 27 Nov 2018 14:00:15 +0000 Subject: [PATCH 28/93]
liquidio: fix spelling mistake "deferal" -> "deferral" There is a spelling mistake in the oct_stats_strings array, fix it. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c index 4c3925af53bc..abe5d0dac851 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c @@ -111,7 +111,7 @@ static const char oct_stats_strings[][ETH_GSTRING_LEN] = { "mac_tx_one_collision", "mac_tx_multi_collision", "mac_tx_max_collision_fail", - "mac_tx_max_deferal_fail", + "mac_tx_max_deferral_fail", "mac_tx_fifo_err", "mac_tx_runts", From 43d0e96022ae3c66743c01bba6c18a3afec7b578 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 27 Nov 2018 14:37:17 +0000 Subject: [PATCH 29/93] openvswitch: fix spelling mistake "execeeds" -> "exceeds" There is a spelling mistake in a net_warn_ratelimited message, fix this. Signed-off-by: Colin Ian King Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a4660c48ff01..cd94f925495a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1166,7 +1166,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); if (err) { net_warn_ratelimited("openvswitch: zone: %u " - "execeeds conntrack limit\n", + "exceeds conntrack limit\n", info->zone.id); return err; } From 37c4b91f955fdd5f4ad771956b97d35f1321098e Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 27 Nov 2018 14:51:17 +0000 Subject: [PATCH 30/93] net: aquantia: fix rx checksum offload bits The last set of csum offload fixes had a leak: Checksum enabled status bits from rx descriptor were incorrectly interpreted. Consequently all the other valid logic worked on zero bits. That caused rx checksum offloads never to trigger. Tested by dumping rx descriptors and validating resulting csum_level. Reported-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: Igor Russkikh Fixes: ad703c2b9127f ("net: aquantia: invalid checksumm offload implementation") Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index f02592f43fe3..a7e853fa43c2 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -674,7 +674,7 @@ static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self, rx_stat = (0x0000003CU & rxd_wb->status) >> 2; - is_rx_check_sum_enabled = (rxd_wb->type) & (0x3U << 19); + is_rx_check_sum_enabled = (rxd_wb->type >> 19) & 0x3U; pkt_type = 0xFFU & (rxd_wb->type >> 4); From 3b5b3a3331d141e8f2a7aaae3a94dfa1e61ecbe4 Mon Sep 17 00:00:00 2001 From: Toni Peltonen Date: Tue, 27 Nov 2018 16:56:57 +0200 Subject: [PATCH 31/93] bonding: fix 802.3ad state sent to partner when unbinding slave Previously when unbinding a slave the 802.3ad implementation only told partner that the port is not suitable for aggregation by setting the port aggregation state from aggregatable to individual. 
This is not enough. If the physical layer still stays up and we only unbound this port from the bond, there is nothing in the aggregation status alone to prevent the partner from sending traffic towards us. To ensure that the partner doesn't consider this port at all anymore, we should also disable collecting and distributing to signal that this actor is going away. Also clear AD_STATE_SYNCHRONIZATION to ensure the partner exits the collecting + distributing state. I have tested this behaviour against Arista EOS switches with mlx5 cards (physical link stays up even when the interface is down) and simulated the same situation virtually (Linux <-> Linux) with two network namespaces running two veth device pairs. In both cases setting aggregation to individual doesn't alone prevent traffic from being sent towards this port, given that the link stays up on the partner's end. The partner still keeps its end in the collecting + distributing state and continues until the timeout is reached. In most cases this means we are losing the traffic the partner sends towards our port while we wait for the timeout. This is most visible with a slow periodic time (LACP rate slow). Other open source implementations like Open vSwitch and libreswitch, and vendor implementations like Arista EOS, seem to disable collecting + distributing when doing a similar port disabling/detaching/removing change. With this patch the kernel implementation behaves the same way and ensures the partner doesn't consider our actor viable anymore. Signed-off-by: Toni Peltonen Signed-off-by: Jay Vosburgh Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index f43fb2f958a5..93dfcef8afc4 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2086,6 +2086,9 @@ void bond_3ad_unbind_slave(struct slave *slave) aggregator->aggregator_identifier); /* Tell the partner that this port is not suitable for aggregation */ + port->actor_oper_port_state &= ~AD_STATE_SYNCHRONIZATION; + port->actor_oper_port_state &= ~AD_STATE_COLLECTING; + port->actor_oper_port_state &= ~AD_STATE_DISTRIBUTING; port->actor_oper_port_state &= ~AD_STATE_AGGREGATION; __update_lacpdu_from_port(port); ad_lacpdu_send(port); From 1166494891da88af25c444e65cd4f32c3e026b46 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Tue, 27 Nov 2018 14:04:11 -0800 Subject: [PATCH 32/93] nfp: flower: release metadata on offload failure Calling nfp_compile_flow_metadata both assigns a stats context and increments a ref counter on (or allocates) a mask id table entry. These are released by the nfp_modify_flow_metadata call on flow deletion; however, if a flow add fails after metadata is set, then the flow entry will be deleted but the metadata assignments leaked. Add an error path to the flow add offload function to ensure allocated metadata is released in the event of an offload fail. Fixes: 81f3ddf2547d ("nfp: add control message passing capabilities to flower offloads") Signed-off-by: John Hurley Reviewed-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S.
Miller --- drivers/net/ethernet/netronome/nfp/flower/offload.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 29c95423ab64..c3ad8d737cf0 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -479,13 +479,13 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, err = nfp_flower_xmit_flow(netdev, flow_pay, NFP_FLOWER_CMSG_TYPE_FLOW_ADD); if (err) - goto err_destroy_flow; + goto err_release_metadata; flow_pay->tc_flower_cookie = flow->cookie; err = rhashtable_insert_fast(&priv->flow_table, &flow_pay->fl_node, nfp_flower_table_params); if (err) - goto err_destroy_flow; + goto err_release_metadata; port->tc_offload_cnt++; @@ -494,6 +494,8 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, return 0; +err_release_metadata: + nfp_modify_flow_metadata(app, flow_pay); err_destroy_flow: kfree(flow_pay->action_data); kfree(flow_pay->mask_data); From b5f0cf08340090d1503dbdbfd797e32264974100 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Tue, 27 Nov 2018 14:04:12 -0800 Subject: [PATCH 33/93] nfp: flower: prevent offload if rhashtable insert fails For flow offload adds, if the rhash insert code fails, the flow will still have been offloaded but the reference to it in the driver freed. Re-order the offload setup calls to ensure that a flow will only be written to FW if a kernel reference is held and stored in the rhashtable. Remove this hashtable entry if the offload fails. Fixes: c01d0efa5136 ("nfp: flower: use rhashtable for flow caching") Signed-off-by: John Hurley Reviewed-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/flower/offload.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index c3ad8d737cf0..2f49eb75f3cc 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -476,17 +476,17 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, if (err) goto err_destroy_flow; - err = nfp_flower_xmit_flow(netdev, flow_pay, - NFP_FLOWER_CMSG_TYPE_FLOW_ADD); - if (err) - goto err_release_metadata; - flow_pay->tc_flower_cookie = flow->cookie; err = rhashtable_insert_fast(&priv->flow_table, &flow_pay->fl_node, nfp_flower_table_params); if (err) goto err_release_metadata; + err = nfp_flower_xmit_flow(netdev, flow_pay, + NFP_FLOWER_CMSG_TYPE_FLOW_ADD); + if (err) + goto err_remove_rhash; + port->tc_offload_cnt++; /* Deallocate flow payload when flower rule has been destroyed. */ @@ -494,6 +494,10 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, return 0; +err_remove_rhash: + WARN_ON_ONCE(rhashtable_remove_fast(&priv->flow_table, + &flow_pay->fl_node, + nfp_flower_table_params)); err_release_metadata: nfp_modify_flow_metadata(app, flow_pay); err_destroy_flow: From c01ac66b38660f2b507ccd0b75d28e3002d56fbb Mon Sep 17 00:00:00 2001 From: David Miller Date: Wed, 28 Nov 2018 22:33:53 -0800 Subject: [PATCH 34/93] bpf: Fix verifier log string check for bad alignment. The message got changed a long time ago. This was responsible for 36 test case failures on sparc64. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: David S.
Miller Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 550b7e46bf4a..5dd4410a716c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -14230,7 +14230,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, reject_from_alignment = fd_prog < 0 && (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) && - strstr(bpf_vlog, "Unknown alignment."); + strstr(bpf_vlog, "misaligned"); #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS if (reject_from_alignment) { printf("FAIL\nFailed due to alignment despite having efficient unaligned access: '%s'!\n", From b7df9ada9a7700dbcca1ba53d217c01e3d48179c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 1 Dec 2018 01:18:53 +0100 Subject: [PATCH 35/93] bpf: fix pointer offsets in context for 32 bit Currently, pointer offsets in three BPF context structures are broken in two scenarios: i) 32 bit compiled applications running on 64 bit kernels, and ii) LLVM compiled BPF programs running on 32 bit kernels. The latter is due to BPF target machine being strictly 64 bit. So in each of the cases the offsets will mismatch in verifier when checking / rewriting context access. Fix this by providing a helper macro __bpf_md_ptr() that will enforce padding up to 64 bit and proper alignment, and for context access a macro bpf_ctx_range_ptr() which will cover full 64 bit member range on 32 bit archs. For flow_keys, we additionally need to force the size check to sizeof(__u64) as with other pointer types. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Reported-by: David S. Miller Signed-off-by: Daniel Borkmann Acked-by: David S. Miller Tested-by: David S. Miller Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +++++++ include/uapi/linux/bpf.h | 17 ++++++++++++----- net/core/filter.c | 16 ++++++++-------- tools/include/uapi/linux/bpf.h | 17 ++++++++++++----- 4 files changed, 39 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 448dcc448f1f..795ff0b869bb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -449,6 +449,13 @@ struct sock_reuseport; offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 +#if BITS_PER_LONG == 64 +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +#else +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 +#endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 852dc17ab47a..426b5c8a245b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2422,6 +2422,12 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6_INLINE }; +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2456,7 +2462,7 @@ struct __sk_buff { /* ... here. 
*/ __u32 data_meta; - struct bpf_flow_keys *flow_keys; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); }; struct bpf_tunnel_key { @@ -2572,8 +2578,8 @@ enum sk_action { * be added to the end of this structure */ struct sk_msg_md { - void *data; - void *data_end; + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -2589,8 +2595,9 @@ struct sk_reuseport_md { * Start of directly accessible data. It begins from * the tcp/udp header. */ - void *data; - void *data_end; /* End of directly accessible data */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). * Note that the directly accessible bytes (data_end - data) diff --git a/net/core/filter.c b/net/core/filter.c index 9a1327eb25fa..6ee605da990f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5435,8 +5435,8 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): - if (size != sizeof(struct bpf_flow_keys *)) + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) return false; break; default: @@ -5464,7 +5464,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5489,7 +5489,7 @@ static bool cg_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5530,7 +5530,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; } @@ -5756,7 +5756,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5958,7 +5958,7 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; } @@ -6039,7 +6039,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; case bpf_ctx_range(struct __sk_buff, tc_classid): diff --git 
a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 852dc17ab47a..426b5c8a245b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2422,6 +2422,12 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6_INLINE }; +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2456,7 +2462,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; - struct bpf_flow_keys *flow_keys; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); }; struct bpf_tunnel_key { @@ -2572,8 +2578,8 @@ enum sk_action { * be added to the end of this structure */ struct sk_msg_md { - void *data; - void *data_end; + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -2589,8 +2595,9 @@ struct sk_reuseport_md { * Start of directly accessible data. It begins from * the tcp/udp header. */ - void *data; - void *data_end; /* End of directly accessible data */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). * Note that the directly accessible bytes (data_end - data) From fd6d433865a2ad1f7e018ef80408cb3dc3be1ab3 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 28 Nov 2018 18:43:42 +0100 Subject: [PATCH 36/93] net/sched: act_police: fix memory leak in case of invalid control action when users set an invalid control action, kmemleak complains as follows: # echo clear >/sys/kernel/debug/kmemleak # ./tdc.py -e b48b Test b48b: Add police action with exceed goto chain control action All test results: 1..1 ok 1 - b48b # Add police action with exceed goto chain control action about to flush the tap output if tests need to be skipped done flushing skipped test tap output # echo scan >/sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffffa0fafbc3dde0 (size 96): comm "tc", pid 2358, jiffies 4294922738 (age 17.022s) hex dump (first 32 bytes): 2a 00 00 20 00 00 00 00 00 00 7d 00 00 00 00 00 *.. ......}..... f8 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000648803d2>] tcf_action_init_1+0x384/0x4c0 [<00000000cb69382e>] tcf_action_init+0x12b/0x1a0 [<00000000847ef0d4>] tcf_action_add+0x73/0x170 [<0000000093656e14>] tc_ctl_action+0x122/0x160 [<0000000023c98e32>] rtnetlink_rcv_msg+0x263/0x2d0 [<000000003493ae9c>] netlink_rcv_skb+0x4d/0x130 [<00000000de63f8ba>] netlink_unicast+0x209/0x2d0 [<00000000c3da0ebe>] netlink_sendmsg+0x2c1/0x3c0 [<000000007a9e0753>] sock_sendmsg+0x33/0x40 [<00000000457c6d2e>] ___sys_sendmsg+0x2a0/0x2f0 [<00000000c5c6a086>] __sys_sendmsg+0x5e/0xa0 [<00000000446eafce>] do_syscall_64+0x5b/0x180 [<000000004aa871f2>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<00000000450c38ef>] 0xffffffffffffffff change tcf_police_init() to avoid leaking 'new' in case TCA_POLICE_RESULT contains TC_ACT_GOTO_CHAIN extended action. Fixes: c08f5ed5d625 ("net/sched: act_police: disallow 'goto chain' on fallback control action") Reported-by: Dan Carpenter Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/act_police.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 37c9b8f0e10f..ec8ec55e0fe8 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -85,7 +85,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { - int ret = 0, err; + int ret = 0, tcfp_result = TC_ACT_OK, err, size; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; struct tcf_police *police; @@ -93,7 +93,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, police_net_id); struct tcf_police_params *new; bool exists = false; - int size; if (nla == NULL) return -EINVAL; @@ -160,6 +159,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, goto failure; } + if (tb[TCA_POLICE_RESULT]) { + tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) { + NL_SET_ERR_MSG(extack, + "goto chain not allowed on fallback"); + err = -EINVAL; + goto failure; + } + } + new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; @@ -167,6 +176,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } /* No failure allowed after this point */ + new->tcfp_result = tcfp_result; new->tcfp_mtu = parm->mtu; if (!new->tcfp_mtu) { new->tcfp_mtu = ~0; @@ -196,16 +206,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); - if (tb[TCA_POLICE_RESULT]) { - new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - if (TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) { - NL_SET_ERR_MSG(extack, - "goto chain not allowed on fallback"); - err = -EINVAL; - goto failure; - } - } - spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); From f71c6143c2038df1cb43a4b9c90740d14f77467c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 30 Nov 2018 15:32:20 -0800 Subject: [PATCH 37/93] bpf: Support sk lookup in netns with id 0 David Ahern and Nicolas Dichtel report that the handling of the netns id 0 is incorrect for the BPF socket lookup helpers: rather than finding the netns with id 0, it is resolving to the current netns. This renders the netns_id 0 inaccessible. To fix this, adjust the API for the netns to treat all negative s32 values as a lookup in the current netns (including u64 values which when truncated to s32 become negative), while any values with a positive value in the signed 32-bit integer space would result in a lookup for a socket in the netns corresponding to that id. As before, if the netns with that ID does not exist, no socket will be found. Any netns outside of these ranges will fail to find a corresponding socket, as those values are reserved for future usage. 
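For illustration only (not part of this patch), a minimal TC classifier sketch using the revised netns semantics, assuming the selftest-style helper declarations from bpf_helpers.h:

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include "bpf_helpers.h"

	SEC("classifier/netns_lookup_sketch")
	int netns_lookup_sketch(struct __sk_buff *skb)
	{
		struct bpf_sock_tuple tuple = {};
		struct bpf_sock *sk;

		/* A negative netns value such as BPF_F_CURRENT_NETNS (-1) selects
		 * the netns associated with the skb; values in [0, S32_MAX] are
		 * relative netns IDs; anything else finds no socket.
		 */
		sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (!sk)
			return TC_ACT_UNSPEC;

		bpf_sk_release(sk); /* a successful lookup must always be released */
		return TC_ACT_OK;
	}

	char _license[] SEC("license") = "GPL";
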
Signed-off-by: Joe Stringer Acked-by: Nicolas Dichtel Acked-by: Joey Pabalinas Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 35 ++++++++++------- net/core/filter.c | 11 +++--- tools/include/uapi/linux/bpf.h | 39 ++++++++++++------- tools/testing/selftests/bpf/bpf_helpers.h | 4 +- .../selftests/bpf/test_sk_lookup_kern.c | 18 ++++----- 5 files changed, 63 insertions(+), 44 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 426b5c8a245b..cba518c57229 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2170,7 +2170,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2187,12 +2187,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2202,7 +2204,7 @@ union bpf_attr { * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. * - * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2219,12 +2221,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. 
*netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2405,6 +2409,9 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output for sk_buff input context. */ #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) +/* Current network namespace */ +#define BPF_F_CURRENT_NETNS (-1L) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index 6ee605da990f..8d2c629501e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4890,22 +4890,23 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *net; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; - if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + if (unlikely(family == AF_UNSPEC || flags || + !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (skb->dev) caller_net = dev_net(skb->dev); else caller_net = sock_net(skb->sk); - if (netns_id) { + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, skb, family, proto); put_net(net); - } else { - net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); } if (sk) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 426b5c8a245b..76b265c7d93e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2170,7 +2170,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2187,12 +2187,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2201,8 +2203,10 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, *struct bpf_sock* + * return is from reuse->socks[] using hash of the packet. 
* - * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2219,12 +2223,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2233,6 +2239,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, *struct bpf_sock* + * return is from reuse->socks[] using hash of the packet. * * int bpf_sk_release(struct bpf_sock *sk) * Description @@ -2405,6 +2413,9 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output for sk_buff input context. */ #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) +/* Current network namespace */ +#define BPF_F_CURRENT_NETNS (-1L) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 686e57ce40f4..efb6c13ab0de 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -154,12 +154,12 @@ static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = (void *) BPF_FUNC_skb_ancestor_cgroup_id; static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, - int size, unsigned int netns_id, + int size, unsigned long long netns_id, unsigned long long flags) = (void *) BPF_FUNC_sk_lookup_tcp; static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, - int size, unsigned int netns_id, + int size, unsigned long long netns_id, unsigned long long flags) = (void *) BPF_FUNC_sk_lookup_udp; static int (*bpf_sk_release)(struct bpf_sock *sk) = diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/test_sk_lookup_kern.c index b745bdc08c2b..e21cd736c196 100644 --- a/tools/testing/selftests/bpf/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/test_sk_lookup_kern.c @@ -72,7 +72,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb) return TC_ACT_SHOT; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); - sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, 0, 0); + sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); if (sk) bpf_sk_release(sk); return sk ? 
TC_ACT_OK : TC_ACT_UNSPEC; @@ -84,7 +84,7 @@ int bpf_sk_lookup_test1(struct __sk_buff *skb) struct bpf_sock_tuple tuple = {}; struct bpf_sock *sk; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); if (sk) bpf_sk_release(sk); return 0; @@ -97,7 +97,7 @@ int bpf_sk_lookup_uaf(struct __sk_buff *skb) struct bpf_sock *sk; __u32 family = 0; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); if (sk) { bpf_sk_release(sk); family = sk->family; @@ -112,7 +112,7 @@ int bpf_sk_lookup_modptr(struct __sk_buff *skb) struct bpf_sock *sk; __u32 family; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); if (sk) { sk += 1; bpf_sk_release(sk); @@ -127,7 +127,7 @@ int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb) struct bpf_sock *sk; __u32 family; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); sk += 1; if (sk) bpf_sk_release(sk); @@ -139,7 +139,7 @@ int bpf_sk_lookup_test2(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; - bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); return 0; } @@ -149,7 +149,7 @@ int bpf_sk_lookup_test3(struct __sk_buff *skb) struct bpf_sock_tuple tuple = {}; struct bpf_sock *sk; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); bpf_sk_release(sk); bpf_sk_release(sk); return 0; @@ -161,7 +161,7 @@ int bpf_sk_lookup_test4(struct __sk_buff *skb) struct bpf_sock_tuple tuple = {}; struct bpf_sock *sk; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); bpf_sk_release(sk); return 0; } @@ -169,7 +169,7 @@ int bpf_sk_lookup_test4(struct __sk_buff *skb) void lookup_no_release(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; - bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); } SEC("fail_no_release_subcall") From d74286d2c25ad29dbf9e342955dd8dc31f21653b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 30 Nov 2018 15:32:21 -0800 Subject: [PATCH 38/93] bpf: Improve socket lookup reuseport documentation Improve the wording around socket lookup for reuseport sockets, and ensure that both bpf.h headers are in sync. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 ++++ tools/include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cba518c57229..72c453a8bf50 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2203,6 +2203,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description @@ -2237,6 +2239,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. 
* Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * int bpf_sk_release(struct bpf_sock *sk) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 76b265c7d93e..72c453a8bf50 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2203,8 +2203,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, *struct bpf_sock* - * return is from reuse->socks[] using hash of the packet. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description @@ -2239,8 +2239,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, *struct bpf_sock* - * return is from reuse->socks[] using hash of the packet. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * int bpf_sk_release(struct bpf_sock *sk) * Description From a3d7e01da06013dc580641a1da57c3b482d58157 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 28 Nov 2018 13:40:04 -0800 Subject: [PATCH 39/93] net: dsa: Fix tagging attribute location While introducing the DSA tagging protocol attribute, it was added to the DSA slave network devices, but those actually see untagged traffic (that is their whole purpose). Correct this mistake by putting the tagging sysfs attribute under the DSA master network device where this is the information that we need. While at it, also correct the sysfs documentation mistake that missed the "dsa/" directory component of the attribute. Fixes: 98cdb4807123 ("net: dsa: Expose tagging protocol to user-space") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- Documentation/ABI/testing/sysfs-class-net-dsa | 2 +- net/dsa/master.c | 34 ++++++++++++++++++- net/dsa/slave.c | 28 --------------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-net-dsa b/Documentation/ABI/testing/sysfs-class-net-dsa index f240221e071e..985d84c585c6 100644 --- a/Documentation/ABI/testing/sysfs-class-net-dsa +++ b/Documentation/ABI/testing/sysfs-class-net-dsa @@ -1,4 +1,4 @@ -What: /sys/class/net//tagging +What: /sys/class/net//dsa/tagging Date: August 2018 KernelVersion: 4.20 Contact: netdev@vger.kernel.org diff --git a/net/dsa/master.c b/net/dsa/master.c index c90ee3227dea..5e8c9bef78bd 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -158,8 +158,31 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) cpu_dp->orig_ethtool_ops = NULL; } +static ssize_t tagging_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct dsa_port *cpu_dp = dev->dsa_ptr; + + return sprintf(buf, "%s\n", + dsa_tag_protocol_to_str(cpu_dp->tag_ops)); +} +static DEVICE_ATTR_RO(tagging); + +static struct attribute *dsa_slave_attrs[] = { + &dev_attr_tagging.attr, + NULL +}; + +static const struct attribute_group dsa_group = { + .name = "dsa", + .attrs = dsa_slave_attrs, +}; + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { + int ret; + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -168,11 +191,20 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) dev->dsa_ptr = cpu_dp; - return dsa_master_ethtool_setup(dev); + ret = dsa_master_ethtool_setup(dev); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->dev.kobj, &dsa_group); + if (ret) + dsa_master_ethtool_teardown(dev); + + return ret; } void dsa_master_teardown(struct net_device *dev) { + sysfs_remove_group(&dev->dev.kobj, &dsa_group); dsa_master_ethtool_teardown(dev); dev->dsa_ptr = NULL; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7d0c19e7edcf..aec78f5aca72 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1058,27 +1058,6 @@ static struct device_type dsa_type = { .name = "dsa", }; -static ssize_t tagging_show(struct device *d, struct device_attribute *attr, - char *buf) -{ - struct net_device *dev = to_net_dev(d); - struct dsa_port *dp = dsa_slave_to_port(dev); - - return sprintf(buf, "%s\n", - dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops)); -} -static DEVICE_ATTR_RO(tagging); - -static struct attribute *dsa_slave_attrs[] = { - &dev_attr_tagging.attr, - NULL -}; - -static const struct attribute_group dsa_group = { - .name = "dsa", - .attrs = dsa_slave_attrs, -}; - static void dsa_slave_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) @@ -1374,14 +1353,8 @@ int dsa_slave_create(struct dsa_port *port) goto out_phy; } - ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group); - if (ret) - goto out_unreg; - return 0; -out_unreg: - unregister_netdev(slave_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); @@ -1405,7 +1378,6 @@ void dsa_slave_destroy(struct net_device *slave_dev) rtnl_unlock(); dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); - sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group); unregister_netdev(slave_dev); phylink_destroy(dp->pl); free_percpu(p->stats64); From ef6fcd455278c2be3032a346cc66d9dd9866b787 Mon Sep 17 00:00:00 2001 From: Cong 
Wang Date: Wed, 28 Nov 2018 15:04:05 -0800 Subject: [PATCH 40/93] mlx5: fix get_ip_proto() IP header is not necessarily located right after struct ethhdr, there could be multiple 802.1Q headers in between, this is why we call __vlan_get_protocol(). Fixes: fe1dc069990c ("net/mlx5e: don't set CHECKSUM_COMPLETE on SCTP packets") Cc: Alaa Hleihel Cc: Or Gerlitz Cc: Saeed Mahameed Signed-off-by: Cong Wang Reviewed-by: Tariq Toukan Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 16985ca3248d..624eed345b5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -724,9 +724,9 @@ static u32 mlx5e_get_fcs(const struct sk_buff *skb) return __get_unaligned_cpu32(fcs_bytes); } -static u8 get_ip_proto(struct sk_buff *skb, __be16 proto) +static u8 get_ip_proto(struct sk_buff *skb, int network_depth, __be16 proto) { - void *ip_p = skb->data + sizeof(struct ethhdr); + void *ip_p = skb->data + network_depth; return (proto == htons(ETH_P_IP)) ? ((struct iphdr *)ip_p)->protocol : ((struct ipv6hdr *)ip_p)->nexthdr; @@ -755,7 +755,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, goto csum_unnecessary; if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) { - if (unlikely(get_ip_proto(skb, proto) == IPPROTO_SCTP)) + if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP)) goto csum_unnecessary; skb->ip_summed = CHECKSUM_COMPLETE; From c0f53771ba45745e5870daf880127925c93f232f Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Thu, 29 Nov 2018 07:54:22 +0800 Subject: [PATCH 41/93] liquidio: read sc->iq_no before release sc The function lio_vf_rep_packet_sent_callback releases the occupation of sc via octeon_free_soft_command. sc should not be used after that. Unfortunately, sc->iq_no is read. To fix this, the patch stores sc->iq_no into a local variable before releasing sc and then uses the local variable instead of sc->iq_no. Signed-off-by: Pan Bian Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c index ea9859e028d4..de61060721c4 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c @@ -349,13 +349,15 @@ lio_vf_rep_packet_sent_callback(struct octeon_device *oct, struct octeon_soft_command *sc = (struct octeon_soft_command *)buf; struct sk_buff *skb = sc->ctxptr; struct net_device *ndev = skb->dev; + u32 iq_no; dma_unmap_single(&oct->pci_dev->dev, sc->dmadptr, sc->datasize, DMA_TO_DEVICE); dev_kfree_skb_any(skb); + iq_no = sc->iq_no; octeon_free_soft_command(oct, sc); - if (octnet_iq_is_full(oct, sc->iq_no)) + if (octnet_iq_is_full(oct, iq_no)) return; if (netif_queue_stopped(ndev)) From 3976535af0cb9fe34a55f2ffb8d7e6b39a2f8188 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:43 -0800 Subject: [PATCH 42/93] tcp: fix off-by-one bug on aborting window-probing socket Previously there is an off-by-one bug on determining when to abort a stalled window-probing socket. This patch fixes that so it is consistent with tcp_write_timeout(). 
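For illustration, a condensed sketch of the corrected check in tcp_probe_timer() (assuming max_probes has already been derived from tcp_retries2 or the user timeout, as the function does earlier): with max_probes == 2, the second unanswered probe now aborts the connection, whereas the old '>' comparison let a third probe go out first.

	/* Sketch: abort once max_probes probes have gone unanswered, matching
	 * the retries accounting in tcp_write_timeout().
	 */
	if (icsk->icsk_probes_out >= max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
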
Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 091c53925e4d..25efdae4368a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -378,7 +378,7 @@ static void tcp_probe_timer(struct sock *sk) return; } - if (icsk->icsk_probes_out > max_probes) { + if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ From ec641b39457e17774313b66697a8a1dc070257bd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:44 -0800 Subject: [PATCH 43/93] tcp: fix SNMP under-estimation on failed retransmission Previously the SNMP counter LINUX_MIB_TCPRETRANSFAIL is not counting the TSO/GSO properly on failed retransmission. This patch fixes that. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3f510cad0b3e..68b5326f7321 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2920,7 +2920,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } return err; } From e1561fe2dd69dc5dddd69bd73aa65355bdfb048b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:45 -0800 Subject: [PATCH 44/93] tcp: fix SNMP TCP timeout under-estimation Previously the SNMP TCPTIMEOUTS counter has inconsistent accounting: 1. It counts all SYN and SYN-ACK timeouts 2. It counts timeouts in other states except recurring timeouts and timeouts after fast recovery or disorder state. Such selective accounting makes analysis difficult and complicated. For example the monitoring system needs to collect many other SNMP counters to infer the total amount of timeout events. This patch makes TCPTIMEOUTS counter simply counts all the retransmit timeout (SYN or data or FIN). Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_timer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 25efdae4368a..f87dbc78b6bc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -484,11 +484,12 @@ void tcp_retransmit_timer(struct sock *sk) goto out_reset_timer; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { - int mib_idx; + int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) @@ -503,10 +504,9 @@ void tcp_retransmit_timer(struct sock *sk) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else { - mib_idx = LINUX_MIB_TCPTIMEOUTS; } - __NET_INC_STATS(sock_net(sk), mib_idx); + if (mib_idx) + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); From 436c9453a1ac0944b82870ef2e0d9be956b396d9 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 29 Nov 2018 13:53:16 +0800 Subject: [PATCH 45/93] virtio-net: keep vnet header zeroed after processing XDP We copy the vnet header unconditionally in page_to_skb(); this is wrong since XDP may modify the packet data. So let's keep a zeroed vnet header to avoid confusing the conversion between vnet header and skb metadata. In the future, we should be able to detect whether or not the packet was modified and keep using the vnet header when the packet was not touched. Fixes: f600b6905015 ("virtio_net: Add XDP support") Reported-by: Pavel Popa Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index cecfd77c9f3c..ea672145f6a6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -365,7 +365,8 @@ static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, struct page *page, unsigned int offset, - unsigned int len, unsigned int truesize) + unsigned int len, unsigned int truesize, + bool hdr_valid) { struct sk_buff *skb; struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -387,7 +388,8 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, else hdr_padded_len = sizeof(struct padded_vnet_hdr); - memcpy(hdr, p, hdr_len); + if (hdr_valid) + memcpy(hdr, p, hdr_len); len -= hdr_len; offset += hdr_padded_len; @@ -739,7 +741,8 @@ static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page = buf; - struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); + struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, + PAGE_SIZE, true); stats->bytes += len - vi->hdr_len; if (unlikely(!skb)) @@ -842,7 +845,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, rcu_read_unlock(); put_page(page); head_skb = page_to_skb(vi, rq, xdp_page, - offset, len, PAGE_SIZE); + offset, len, + PAGE_SIZE, false); return head_skb; } break; @@ -898,7 +902,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto err_skb; } - head_skb = page_to_skb(vi, rq, page, offset, len, truesize); + head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog); curr_skb = head_skb; if (unlikely(!curr_skb)) From 35b827b6d06199841a83839e8bb69c0cd13a28be Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 29 Nov 2018 14:45:39 +0100 Subject: [PATCH 46/93] tun: forbid iface creation with rtnl ops It's not supported right now (the goal
of the initial patch was to support 'ip link del' only). Before the patch: $ ip link add foo type tun [ 239.632660] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [snip] [ 239.636410] RIP: 0010:register_netdevice+0x8e/0x3a0 This panic occurs because dev->netdev_ops is not set by tun_setup(). But to have something usable, it will require more than just setting netdev_ops. Fixes: f019a7a594d9 ("tun: Implement ip link del tunXXX") CC: Eric W. Biederman Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/tun.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e244f5d7512a..cf349e65a66b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2293,9 +2293,9 @@ static void tun_setup(struct net_device *dev) static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - if (!data) - return 0; - return -EINVAL; + NL_SET_ERR_MSG(extack, + "tun/tap creation via rtnetlink is not supported."); + return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) From dcb40590e69e306030e944a39d0e4bf54247fb68 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 1 Dec 2018 10:39:44 -0800 Subject: [PATCH 47/93] bpf: refactor bpf_test_run() to separate own failures and test program result After commit f42ee093be29 ("bpf/test_run: support cgroup local storage") the bpf_test_run() function may fail with -ENOMEM, if it's not possible to allocate memory for a cgroup local storage. This error shouldn't be mixed with the return value of the testing program. Let's add an additional argument with a pointer where to store the testing program's result; and make bpf_test_run() return either 0 or -ENOMEM. 
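For illustration, a condensed caller-side sketch of the new convention (prog, skb, kattr, uattr and size as set up earlier in bpf_prog_test_run_skb(); error unwinding trimmed for brevity):

	u32 retval, duration;
	int err;

	/* err reports infrastructure failures such as -ENOMEM ... */
	err = bpf_test_run(prog, skb, repeat, &retval, &duration);
	if (err)
		return err;

	/* ... while retval carries the test program's own return code. */
	return bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
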
Fixes: f42ee093be29 ("bpf/test_run: support cgroup local storage") Reported-by: Dan Carpenter Suggested-by: Alexei Starovoitov Signed-off-by: Roman Gushchin Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c89c22c49015..25001913d03b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -28,12 +28,13 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, return ret; } -static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, + u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; - u32 ret = 0, i; + u32 i; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -49,7 +50,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx, storage); + *ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -65,7 +66,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return ret; + return 0; } static int bpf_test_finish(const union bpf_attr *kattr, @@ -165,7 +166,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); - retval = bpf_test_run(prog, skb, repeat, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + if (ret) { + kfree_skb(skb); + kfree(sk); + return ret; + } if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -212,11 +218,14 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - retval = bpf_test_run(prog, &xdp, repeat, &duration); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + if (ret) + goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); +out: kfree(data); return ret; } From d2a36971ef595069b7a600d1144c2e0881a930a1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Dec 2018 08:19:33 +0100 Subject: [PATCH 48/93] net: phy: don't allow __set_phy_supported to add unsupported modes Currently __set_phy_supported allows to add modes w/o checking whether the PHY supports them. This is wrong, it should never add modes but only remove modes we don't want to support. The commit marked as fixed didn't do anything wrong, it just copied existing functionality to the helper which is being fixed now. Fixes: f3a6bd393c2c ("phylib: Add phy_set_max_speed helper") Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 23ee3967c166..18e92c19c5ab 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1880,20 +1880,17 @@ EXPORT_SYMBOL(genphy_loopback); static int __set_phy_supported(struct phy_device *phydev, u32 max_speed) { - phydev->supported &= ~(PHY_1000BT_FEATURES | PHY_100BT_FEATURES | - PHY_10BT_FEATURES); - switch (max_speed) { - default: - return -ENOTSUPP; - case SPEED_1000: - phydev->supported |= PHY_1000BT_FEATURES; + case SPEED_10: + phydev->supported &= ~PHY_100BT_FEATURES; /* fall through */ case SPEED_100: - phydev->supported |= PHY_100BT_FEATURES; - /* fall through */ - case SPEED_10: - phydev->supported |= PHY_10BT_FEATURES; + phydev->supported &= ~PHY_1000BT_FEATURES; + break; + case SPEED_1000: + break; + default: + return -ENOTSUPP; } return 0; From a5d4a89245ead1f37ed135213653c5beebea4237 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Mon, 3 Dec 2018 15:33:07 +0800 Subject: [PATCH 49/93] net: 8139cp: fix a BUG triggered by changing mtu with network traffic When changing mtu many times with traffic, a bug is triggered: [ 1035.684037] kernel BUG at lib/dynamic_queue_limits.c:26! [ 1035.684042] invalid opcode: 0000 [#1] SMP [ 1035.684049] Modules linked in: loop binfmt_misc 8139cp(OE) macsec tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag tcp_lp fuse uinput xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter devlink ip6_tables iptable_filter sunrpc snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep ppdev snd_seq iosf_mbi crc32_pclmul parport_pc snd_seq_device ghash_clmulni_intel parport snd_pcm aesni_intel joydev lrw snd_timer virtio_balloon sg gf128mul glue_helper ablk_helper cryptd snd soundcore i2c_piix4 pcspkr ip_tables xfs libcrc32c sr_mod sd_mod cdrom crc_t10dif crct10dif_generic ata_generic [ 1035.684102] pata_acpi virtio_console qxl drm_kms_helper syscopyarea sysfillrect sysimgblt floppy fb_sys_fops crct10dif_pclmul crct10dif_common ttm crc32c_intel serio_raw ata_piix drm libata 8139too virtio_pci drm_panel_orientation_quirks virtio_ring virtio mii dm_mirror dm_region_hash dm_log dm_mod [last unloaded: 8139cp] [ 1035.684132] CPU: 9 PID: 25140 Comm: if-mtu-change Kdump: loaded Tainted: G OE ------------ T 3.10.0-957.el7.x86_64 #1 [ 1035.684134] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 1035.684136] task: ffff8f59b1f5a080 ti: ffff8f5a2e32c000 task.ti: ffff8f5a2e32c000 [ 1035.684149] RIP: 0010:[] [] dql_completed+0x180/0x190 [ 1035.684162] RSP: 0000:ffff8f5a75483e50 EFLAGS: 00010093 [ 1035.684162] RAX: 00000000000000c2 RBX: ffff8f5a6f91c000 RCX: 0000000000000000 [ 1035.684162] RDX: 0000000000000000 RSI: 0000000000000184 RDI: ffff8f599fea3ec0 [ 1035.684162] RBP: ffff8f5a75483ea8 R08: 00000000000000c2 R09: 0000000000000000 [ 1035.684162] R10: 00000000000616ef R11: ffff8f5a75483b56 R12: ffff8f599fea3e00 [ 1035.684162] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000184 [ 1035.684162] FS: 00007fa8434de740(0000) GS:ffff8f5a75480000(0000) knlGS:0000000000000000 [ 1035.684162] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1035.684162] CR2: 00000000004305d0 CR3: 000000024eb66000 CR4: 00000000001406e0 [ 1035.684162] Call 
Trace: [ 1035.684162] [ 1035.684162] [] ? cp_interrupt+0x478/0x580 [8139cp] [ 1035.684162] [] __handle_irq_event_percpu+0x44/0x1c0 [ 1035.684162] [] handle_irq_event_percpu+0x32/0x80 [ 1035.684162] [] handle_irq_event+0x3c/0x60 [ 1035.684162] [] handle_fasteoi_irq+0x59/0x110 [ 1035.684162] [] handle_irq+0xe4/0x1a0 [ 1035.684162] [] do_IRQ+0x4d/0xf0 [ 1035.684162] [] common_interrupt+0x162/0x162 [ 1035.684162] [ 1035.684162] [] ? __wake_up_bit+0x24/0x70 [ 1035.684162] [] ? do_set_pte+0xd5/0x120 [ 1035.684162] [] unlock_page+0x2b/0x30 [ 1035.684162] [] do_read_fault.isra.61+0x139/0x1b0 [ 1035.684162] [] handle_pte_fault+0x2f4/0xd10 [ 1035.684162] [] handle_mm_fault+0x39d/0x9b0 [ 1035.684162] [] __do_page_fault+0x203/0x500 [ 1035.684162] [] trace_do_page_fault+0x56/0x150 [ 1035.684162] [] do_async_page_fault+0x22/0xf0 [ 1035.684162] [] async_page_fault+0x28/0x30 [ 1035.684162] Code: 54 c7 47 54 ff ff ff ff 44 0f 49 ce 48 8b 35 48 2f 9c 00 48 89 77 58 e9 fe fe ff ff 0f 1f 80 00 00 00 00 41 89 d1 e9 ef fe ff ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 55 8d 42 ff 48 [ 1035.684162] RIP [] dql_completed+0x180/0x190 [ 1035.684162] RSP It's not the same as in 7fe0ee09 patch described. As 8139cp uses shared irq mode, other device irq will trigger cp_interrupt to execute. cp_change_mtu -> cp_close -> cp_open In cp_close routine just before free_irq(), some interrupt may occur. In my environment, cp_interrupt exectutes and IntrStatus is 0x4, exactly TxOk. That will cause cp_tx to wake device queue. As device queue is started, cp_start_xmit and cp_open will run at same time which will cause kernel BUG. For example: [#] for tx descriptor At start: [#][#][#] num_queued=3 After cp_init_hw->cp_start_hw->netdev_reset_queue: [#][#][#] num_queued=0 When 8139cp starts to work then cp_tx will check num_queued mismatchs the complete_bytes. The patch will check IntrMask before check IntrStatus in cp_interrupt. When 8139cp interrupt is disabled, just return. Signed-off-by: Su Yanjun Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 81045dfa1cd8..44f6e4873aad 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -571,6 +571,7 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance) struct cp_private *cp; int handled = 0; u16 status; + u16 mask; if (unlikely(dev == NULL)) return IRQ_NONE; @@ -578,6 +579,10 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance) spin_lock(&cp->lock); + mask = cpr16(IntrMask); + if (!mask) + goto out_unlock; + status = cpr16(IntrStatus); if (!status || (status == 0xFFFF)) goto out_unlock; From 4e4b08e55889da97dec750759f3ade8cc92b4644 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 3 Dec 2018 18:09:24 +0900 Subject: [PATCH 50/93] tun: remove skb access after netif_receive_skb In tun.c skb->len was accessed while doing stats accounting after a call to netif_receive_skb. We can not access skb after this call because buffers may be dropped. The fix for this bug would be to store skb->len in local variable and then use it after netif_receive_skb(). IMO using xdp data size for accounting bytes will be better because input for tun_xdp_one() is xdp_buff. 
Hence this patch: - fixes a bug by removing skb access after netif_receive_skb() - uses xdp data size for accounting bytes [613.019057] BUG: KASAN: use-after-free in tun_sendmsg+0x77c/0xc50 [tun] [613.021062] Read of size 4 at addr ffff8881da9ab7c0 by task vhost-1115/1155 [613.023073] [613.024003] CPU: 0 PID: 1155 Comm: vhost-1115 Not tainted 4.20.0-rc3-vm+ #232 [613.026029] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [613.029116] Call Trace: [613.031145] dump_stack+0x5b/0x90 [613.032219] print_address_description+0x6c/0x23c [613.034156] ? tun_sendmsg+0x77c/0xc50 [tun] [613.036141] kasan_report.cold.5+0x241/0x308 [613.038125] tun_sendmsg+0x77c/0xc50 [tun] [613.040109] ? tun_get_user+0x1960/0x1960 [tun] [613.042094] ? __isolate_free_page+0x270/0x270 [613.045173] vhost_tx_batch.isra.14+0xeb/0x1f0 [vhost_net] [613.047127] ? peek_head_len.part.13+0x90/0x90 [vhost_net] [613.049096] ? get_tx_bufs+0x5a/0x2c0 [vhost_net] [613.051106] ? vhost_enable_notify+0x2d8/0x420 [vhost] [613.053139] handle_tx_copy+0x2d0/0x8f0 [vhost_net] [613.053139] ? vhost_net_buf_peek+0x340/0x340 [vhost_net] [613.053139] ? __mutex_lock+0x8d9/0xb30 [613.053139] ? finish_task_switch+0x8f/0x3f0 [613.053139] ? handle_tx+0x32/0x120 [vhost_net] [613.053139] ? mutex_trylock+0x110/0x110 [613.053139] ? finish_task_switch+0xcf/0x3f0 [613.053139] ? finish_task_switch+0x240/0x3f0 [613.053139] ? __switch_to_asm+0x34/0x70 [613.053139] ? __switch_to_asm+0x40/0x70 [613.053139] ? __schedule+0x506/0xf10 [613.053139] handle_tx+0xc7/0x120 [vhost_net] [613.053139] vhost_worker+0x166/0x200 [vhost] [613.053139] ? vhost_dev_init+0x580/0x580 [vhost] [613.053139] ? __kthread_parkme+0x77/0x90 [613.053139] ? vhost_dev_init+0x580/0x580 [vhost] [613.053139] kthread+0x1b1/0x1d0 [613.053139] ? 
kthread_park+0xb0/0xb0 [613.053139] ret_from_fork+0x35/0x40 [613.088705] [613.088705] Allocated by task 1155: [613.088705] kasan_kmalloc+0xbf/0xe0 [613.088705] kmem_cache_alloc+0xdc/0x220 [613.088705] __build_skb+0x2a/0x160 [613.088705] build_skb+0x14/0xc0 [613.088705] tun_sendmsg+0x4f0/0xc50 [tun] [613.088705] vhost_tx_batch.isra.14+0xeb/0x1f0 [vhost_net] [613.088705] handle_tx_copy+0x2d0/0x8f0 [vhost_net] [613.088705] handle_tx+0xc7/0x120 [vhost_net] [613.088705] vhost_worker+0x166/0x200 [vhost] [613.088705] kthread+0x1b1/0x1d0 [613.088705] ret_from_fork+0x35/0x40 [613.088705] [613.088705] Freed by task 1155: [613.088705] __kasan_slab_free+0x12e/0x180 [613.088705] kmem_cache_free+0xa0/0x230 [613.088705] ip6_mc_input+0x40f/0x5a0 [613.088705] ipv6_rcv+0xc9/0x1e0 [613.088705] __netif_receive_skb_one_core+0xc1/0x100 [613.088705] netif_receive_skb_internal+0xc4/0x270 [613.088705] br_pass_frame_up+0x2b9/0x2e0 [613.088705] br_handle_frame_finish+0x2fb/0x7a0 [613.088705] br_handle_frame+0x30f/0x6c0 [613.088705] __netif_receive_skb_core+0x61a/0x15b0 [613.088705] __netif_receive_skb_one_core+0x8e/0x100 [613.088705] netif_receive_skb_internal+0xc4/0x270 [613.088705] tun_sendmsg+0x738/0xc50 [tun] [613.088705] vhost_tx_batch.isra.14+0xeb/0x1f0 [vhost_net] [613.088705] handle_tx_copy+0x2d0/0x8f0 [vhost_net] [613.088705] handle_tx+0xc7/0x120 [vhost_net] [613.088705] vhost_worker+0x166/0x200 [vhost] [613.088705] kthread+0x1b1/0x1d0 [613.088705] ret_from_fork+0x35/0x40 [613.088705] [613.088705] The buggy address belongs to the object at ffff8881da9ab740 [613.088705] which belongs to the cache skbuff_head_cache of size 232 Fixes: 043d222f93ab ("tuntap: accept an array of XDP buffs through sendmsg()") Reviewed-by: Toshiaki Makita Signed-off-by: Prashant Bhole Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index cf349e65a66b..005020042be9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2385,6 +2385,7 @@ static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush) { + unsigned int datasize = xdp->data_end - xdp->data; struct tun_xdp_hdr *hdr = xdp->data_hard_start; struct virtio_net_hdr *gso = &hdr->gso; struct tun_pcpu_stats *stats; @@ -2461,7 +2462,7 @@ build: stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); stats->rx_packets++; - stats->rx_bytes += skb->len; + stats->rx_bytes += datasize; u64_stats_update_end(&stats->syncp); put_cpu_ptr(stats); From 025dceb0fab31c912c41b8f32577432231d83e6b Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 3 Dec 2018 17:51:04 +0530 Subject: [PATCH 51/93] bpf: powerpc64: optimize JIT passes for bpf function calls Once the JITed images for each function in a multi-function program are generated after the first three JIT passes, we only need to fix the target address for the branch instruction corresponding to each bpf-to-bpf function call. This introduces the following optimizations for reducing the work done by the JIT compiler when handling multi-function programs: [1] Instead of doing two extra passes to fix the bpf function calls, do just one as that would be sufficient. [2] During the extra pass, only overwrite the instruction sequences for the bpf-to-bpf function calls as everything else would still remain exactly the same. This also reduces the number of writes to the JITed image. 
[3] Do not regenerate the prologue and the epilogue during the extra pass as that would be redundant. Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- arch/powerpc/net/bpf_jit_comp64.c | 66 +++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 17482f5de3e2..9393e231cbc2 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -891,6 +891,55 @@ cond_branch: return 0; } +/* Fix the branch target addresses for subprog calls */ +static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, + struct codegen_context *ctx, u32 *addrs) +{ + const struct bpf_insn *insn = fp->insnsi; + bool func_addr_fixed; + u64 func_addr; + u32 tmp_idx; + int i, ret; + + for (i = 0; i < fp->len; i++) { + /* + * During the extra pass, only the branch target addresses for + * the subprog calls need to be fixed. All other instructions + * can left untouched. + * + * The JITed image length does not change because we already + * ensure that the JITed instruction sequence for these calls + * are of fixed length by padding them with NOPs. + */ + if (insn[i].code == (BPF_JMP | BPF_CALL) && + insn[i].src_reg == BPF_PSEUDO_CALL) { + ret = bpf_jit_get_func_addr(fp, &insn[i], true, + &func_addr, + &func_addr_fixed); + if (ret < 0) + return ret; + + /* + * Save ctx->idx as this would currently point to the + * end of the JITed image and set it to the offset of + * the instruction sequence corresponding to the + * subprog call temporarily. + */ + tmp_idx = ctx->idx; + ctx->idx = addrs[i] / 4; + bpf_jit_emit_func_call_rel(image, ctx, func_addr); + + /* + * Restore ctx->idx here. This is safe as the length + * of the JITed sequence remains unchanged. + */ + ctx->idx = tmp_idx; + } + } + + return 0; +} + struct powerpc64_jit_data { struct bpf_binary_header *header; u32 *addrs; @@ -989,6 +1038,22 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) skip_init_ctx: code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); + if (extra_pass) { + /* + * Do not touch the prologue and epilogue as they will remain + * unchanged. Only fix the branch target address for subprog + * calls in the body. + * + * This does not change the offsets and lengths of the subprog + * call instruction sequences and hence, the size of the JITed + * image as well. + */ + bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs); + + /* There is no need to perform the usual passes. */ + goto skip_codegen_passes; + } + /* Code generation passes 1-2 */ for (pass = 1; pass < 3; pass++) { /* Now build the prologue, body code & epilogue for real. */ @@ -1002,6 +1067,7 @@ skip_init_ctx: proglen - (cgctx.idx * 4), cgctx.seen); } +skip_codegen_passes: if (bpf_jit_enable > 1) /* * Note that we output the base address of the code_base From ef1b5bf506b1f0ee3edc98533e1f3ecb105eb46a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 28 Nov 2018 09:02:41 +0000 Subject: [PATCH 52/93] net: phy: Fix not to call phy_resume() if PHY is not attached This patch fixes an issue that mdio_bus_phy_resume() doesn't call phy_resume() if the PHY is not attached. Fixes: 803dd9c77ac3 ("net: phy: avoid suspending twice a PHY") Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 18e92c19c5ab..c4b9008c52d2 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -220,7 +220,7 @@ static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); #ifdef CONFIG_PM -static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) +static bool mdio_bus_phy_may_suspend(struct phy_device *phydev, bool suspend) { struct device_driver *drv = phydev->mdio.dev.driver; struct phy_driver *phydrv = to_phy_driver(drv); @@ -232,10 +232,11 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) /* PHY not attached? May suspend if the PHY has not already been * suspended as part of a prior call to phy_disconnect() -> * phy_detach() -> phy_suspend() because the parent netdev might be the - * MDIO bus driver and clock gated at this point. + * MDIO bus driver and clock gated at this point. Also may resume if + * PHY is not attached. */ if (!netdev) - return !phydev->suspended; + return suspend ? !phydev->suspended : phydev->suspended; if (netdev->wol_enabled) return false; @@ -270,7 +271,7 @@ static int mdio_bus_phy_suspend(struct device *dev) if (phydev->attached_dev && phydev->adjust_link) phy_stop_machine(phydev); - if (!mdio_bus_phy_may_suspend(phydev)) + if (!mdio_bus_phy_may_suspend(phydev, true)) return 0; return phy_suspend(phydev); @@ -281,7 +282,7 @@ static int mdio_bus_phy_resume(struct device *dev) struct phy_device *phydev = to_phy_device(dev); int ret; - if (!mdio_bus_phy_may_suspend(phydev)) + if (!mdio_bus_phy_may_suspend(phydev, false)) goto no_resume; ret = phy_resume(phydev); From 8c85f4b81296a530b8af2796c110fa482ac42d4f Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 28 Nov 2018 09:02:42 +0000 Subject: [PATCH 53/93] net: phy: micrel: add toggling phy reset if PHY is not attached This patch adds toggling phy reset if PHY is not attached. Otherwise, some boards (e.g. R-Car H3 Salvator-XS) cannot link up correctly if we do the following method: 1) Kernel boots by using initramfs. --> No open the nic, so phy_device_register() and phy_probe() deasserts the reset. 2) Kernel enters the suspend. --> So, keep the reset signal as deassert. --> On R-Car Salvator-XS board, unfortunately, the board power is turned off. 3) Kernel returns from suspend. 4) ifconfig eth0 up --> Then, since edge signal of the reset doesn't happen, it cannot link up. 5) ifconfig eth0 down 6) ifconfig eth0 up --> In this case, it can link up. Reported-by: Hiromitsu Yamasaki Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. 
Miller --- drivers/net/phy/micrel.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 9265dea79412..1679a6ea104c 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -23,6 +23,7 @@ * ksz9477 */ +#include #include #include #include @@ -835,6 +836,13 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; + if (!phydev->attached_dev) { + /* If the PHY is not attached, toggle the reset */ + phy_device_reset(phydev, 1); + udelay(1); + phy_device_reset(phydev, 0); + } + genphy_resume(phydev); ret = kszphy_config_reset(phydev); From e3f787189e10f5fafce77ba8aa948741ebb93c2b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 30 Nov 2018 16:05:53 +0000 Subject: [PATCH 54/93] vhost: fix IOTLB locking Commit 78139c94dc8c ("net: vhost: lock the vqs one by one") moved the vq lock to improve scalability, but introduced a possible deadlock in vhost-iotlb. vhost_iotlb_notify_vq() now takes vq->mutex while holding the device's IOTLB spinlock. And on the vhost_iotlb_miss() path, the spinlock is taken while holding vq->mutex. Since calling vhost_poll_queue() doesn't require any lock, avoid the deadlock by not taking vq->mutex. Fixes: 78139c94dc8c ("net: vhost: lock the vqs one by one") Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: Jean-Philippe Brucker Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 3a5f81a66d34..6b98d8e3a5bf 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -944,10 +944,7 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d, if (msg->iova <= vq_msg->iova && msg->iova + msg->size - 1 >= vq_msg->iova && vq_msg->type == VHOST_IOTLB_MISS) { - mutex_lock(&node->vq->mutex); vhost_poll_queue(&node->vq->poll); - mutex_unlock(&node->vq->mutex); - list_del(&node->node); kfree(node); } From 986103e7920cabc0b910749e77ae5589d3934d52 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 30 Nov 2018 10:59:08 -0600 Subject: [PATCH 55/93] net/ibmvnic: Fix RTNL deadlock during device reset Commit a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") made the change to hold the RTNL lock during driver reset but still calls netdev_notify_peers, which results in a deadlock. Instead, use call_netdevice_notifiers, which is functionally the same except that it does not take the RTNL lock again. Fixes: a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c0203a0d5e3b..ed50b8dee44f 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1859,7 +1859,7 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (adapter->reset_reason != VNIC_RESET_FAILOVER && adapter->reset_reason != VNIC_RESET_CHANGE_PARAM) - netdev_notify_peers(netdev); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, netdev); netif_carrier_on(netdev); From fb6df5a6234c38a9c551559506a49a677ac6f07a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 Dec 2018 01:36:59 +0800 Subject: [PATCH 56/93] sctp: kfree_rcu asoc In sctp_hash_transport/sctp_epaddr_lookup_transport, it dereferences a transport's asoc under rcu_read_lock while asoc is freed not after a grace period, which leads to a use-after-free panic. 
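[illustrative sketch, not part of the patch: the generic pattern for making such rcu_read_lock() lookups safe is to embed an rcu_head in the object and defer its release past a grace period instead of freeing it immediately]

	struct foo {
		/* ... fields read under rcu_read_lock() ... */
		struct rcu_head rcu;
	};

	/* instead of kfree(f): */
	kfree_rcu(f, rcu);
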
This patch fixes it by calling kfree_rcu to make asoc be freed after a grace period. Note that only the asoc's memory is delayed to free in the patch, it won't cause sk to linger longer. Thanks Neil and Marcelo to make this clear. Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport rhashtable") Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new transport") Reported-by: syzbot+0b05d8aa7cb185107483@syzkaller.appspotmail.com Reported-by: syzbot+aad231d51b1923158444@syzkaller.appspotmail.com Suggested-by: Neil Horman Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 ++ net/sctp/associola.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a11f93790476..feada358d872 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2075,6 +2075,8 @@ struct sctp_association { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + + struct rcu_head rcu; }; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dd77ec3892b6..914750b819b2 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -435,7 +435,7 @@ static void sctp_association_destroy(struct sctp_association *asoc) WARN_ON(atomic_read(&asoc->rmem_alloc)); - kfree(asoc); + kfree_rcu(asoc, rcu); SCTP_DBG_OBJCNT_DEC(assoc); } From 59f997b088d26a774958cb7b17b0763cd82de7ec Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Sat, 1 Dec 2018 00:26:27 +0100 Subject: [PATCH 57/93] macvlan: return correct error value A MAC address must be unique among all the macvlan devices with the same lower device. The only exception is the passthru [sic] mode, which shares the lower device address. When duplicate addresses are detected, EBUSY is returned when bringing the interface up: # ip link add macvlan0 link eth0 type macvlan # read addr Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index fc8d5f1ee1ad..0da3d36b283b 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -608,7 +608,7 @@ static int macvlan_open(struct net_device *dev) goto hash_add; } - err = -EBUSY; + err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; @@ -706,7 +706,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) - return -EBUSY; + return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); @@ -747,6 +747,9 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p) return dev_set_mac_address(vlan->lowerdev, addr); } + if (macvlan_addr_busy(vlan->port, addr->sa_data)) + return -EADDRINUSE; + return macvlan_sync_address(dev, addr->sa_data); } From a74515604a7b171f2702bdcbd1e231225fb456d0 Mon Sep 17 00:00:00 2001 From: Anderson Luiz Alves Date: Fri, 30 Nov 2018 21:58:36 -0200 Subject: [PATCH 58/93] mv88e6060: disable hardware level MAC learning Disable hardware level MAC learning because it breaks station roaming. When enabled it drops all frames that arrive from a MAC address that is on a different port at learning table. Signed-off-by: Anderson Luiz Alves Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6060.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 65f10fec25b3..0b3e51f248c2 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -116,8 +116,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds) /* Reset the switch. */ REG_WRITE(REG_GLOBAL, GLOBAL_ATU_CONTROL, GLOBAL_ATU_CONTROL_SWRESET | - GLOBAL_ATU_CONTROL_ATUSIZE_1024 | - GLOBAL_ATU_CONTROL_ATE_AGE_5MIN); + GLOBAL_ATU_CONTROL_LEARNDIS); /* Wait up to one second for reset to complete. */ timeout = jiffies + 1 * HZ; @@ -142,13 +141,10 @@ static int mv88e6060_setup_global(struct dsa_switch *ds) */ REG_WRITE(REG_GLOBAL, GLOBAL_CONTROL, GLOBAL_CONTROL_MAX_FRAME_1536); - /* Enable automatic address learning, set the address - * database size to 1024 entries, and set the default aging - * time to 5 minutes. + /* Disable automatic address learning. */ REG_WRITE(REG_GLOBAL, GLOBAL_ATU_CONTROL, - GLOBAL_ATU_CONTROL_ATUSIZE_1024 | - GLOBAL_ATU_CONTROL_ATE_AGE_5MIN); + GLOBAL_ATU_CONTROL_LEARNDIS); return 0; } From bf29e9e9b6d2f09cdbf39b48d028f0b49e944f85 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 1 Dec 2018 21:11:19 -0500 Subject: [PATCH 59/93] net/core: tidy up an error message netif_napi_add() could report an error like this below due to it allows to pass a format string for wildcarding before calling dev_get_valid_name(), "netif_napi_add() called with weight 256 on device eth%d" For example, hns_enet_drv module does this. hns_nic_try_get_ae hns_nic_init_ring_data netif_napi_add register_netdev dev_get_valid_name Hence, make it a bit more human-readable by using netdev_err_once() instead. Signed-off-by: Qian Cai Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 3470e7fff1f4..e06223b65674 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6209,8 +6209,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) - pr_err_once("netif_napi_add() called with weight %d on device %s\n", - weight, dev->name); + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; From 24be19e47779d604d1492c114459dca9a92acf78 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 2 Dec 2018 14:34:36 +0200 Subject: [PATCH 60/93] net/mlx4_en: Change min MTU size to ETH_MIN_MTU NIC driver minimal MTU size shall be set to ETH_MIN_MTU, as defined in the RFC791 and in the network stack. Remove old mlx4_en only define for it, which was set to wrong value. Fixes: b80f71f5816f ("ethernet/mellanox: use core min/max MTU checking") Signed-off-by: Eran Ben Elisha Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index b744cd49a785..6b88881b8e35 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -3493,8 +3493,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; } - /* MTU range: 46 - hw-specific max */ - dev->min_mtu = MLX4_EN_MIN_MTU; + /* MTU range: 68 - hw-specific max */ + dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = priv->max_mtu; mdev->pndev[port] = dev; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 485d856546c6..8137454e2534 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -161,7 +161,6 @@ #define MLX4_SELFTEST_LB_MIN_MTU (MLX4_LOOPBACK_TEST_PAYLOAD + NET_IP_ALIGN + \ ETH_HLEN + PREAMBLE_LEN) -#define MLX4_EN_MIN_MTU 46 /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple * headers. (For example: ETH_P_8021Q and ETH_P_8021AD). */ From 1b603f9e4313348608f256b564ed6e3d9e67f377 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 2 Dec 2018 14:34:37 +0200 Subject: [PATCH 61/93] net/mlx4_en: Fix build break when CONFIG_INET is off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MLX4_EN depends on NETDEVICES, ETHERNET and INET Kconfigs. Make sure they are listed in MLX4_EN Kconfig dependencies. This fixes the following build break: drivers/net/ethernet/mellanox/mlx4/en_rx.c:582:18: warning: ‘struct iphdr’ declared inside parameter list [enabled by default] struct iphdr *iph) ^ drivers/net/ethernet/mellanox/mlx4/en_rx.c:582:18: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] drivers/net/ethernet/mellanox/mlx4/en_rx.c: In function ‘get_fixed_ipv4_csum’: drivers/net/ethernet/mellanox/mlx4/en_rx.c:586:20: error: dereferencing pointer to incomplete type _u8 ipproto = iph->protocol; Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig index 36054e6fb9d3..f200b8c420d5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig @@ -5,7 +5,7 @@ config MLX4_EN tristate "Mellanox Technologies 1/10/40Gbit Ethernet support" depends on MAY_USE_DEVLINK - depends on PCI + depends on PCI && NETDEVICES && ETHERNET && INET select MLX4_CORE imply PTP_1588_CLOCK ---help--- From c3494801cd1785e2c25f1a5735fa19ddcf9665da Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Dec 2018 22:46:04 -0800 Subject: [PATCH 62/93] bpf: check pending signals while verifying programs Malicious user space may try to force the verifier to use as much cpu time and memory as possible. Hence check for pending signals while verifying the program. Note that suspend of sys_bpf(PROG_LOAD) syscall will lead to EAGAIN, since the kernel has to release the resources used for program verification. 
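[illustrative sketch, not part of the patch: a user-space loader that can be interrupted may simply retry the load on EAGAIN; attr is assumed to be an already prepared union bpf_attr]

	/* needs <unistd.h>, <errno.h>, <sys/syscall.h>, <linux/bpf.h> */
	int fd;

	do {
		fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	} while (fd < 0 && errno == EAGAIN);
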
Reported-by: Anatoly Trosinenko Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6dd419550aba..751bb30b7c5c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5148,6 +5148,9 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } + if (signal_pending(current)) + return -EAGAIN; + if (need_resched()) cond_resched(); From 4f7b3e82589e0de723780198ec7983e427144c0a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Dec 2018 22:46:05 -0800 Subject: [PATCH 63/93] bpf: improve verifier branch analysis pathological bpf programs may try to force verifier to explode in the number of branch states: 20: (d5) if r1 s<= 0x24000028 goto pc+0 21: (b5) if r0 <= 0xe1fa20 goto pc+2 22: (d5) if r1 s<= 0x7e goto pc+0 23: (b5) if r0 <= 0xe880e000 goto pc+0 24: (c5) if r0 s< 0x2100ecf4 goto pc+0 25: (d5) if r1 s<= 0xe880e000 goto pc+1 26: (c5) if r0 s< 0xf4041810 goto pc+0 27: (d5) if r1 s<= 0x1e007e goto pc+0 28: (b5) if r0 <= 0xe86be000 goto pc+0 29: (07) r0 += 16614 30: (c5) if r0 s< 0x6d0020da goto pc+0 31: (35) if r0 >= 0x2100ecf4 goto pc+0 Teach verifier to recognize always taken and always not taken branches. This analysis is already done for == and != comparison. Expand it to all other branches. It also helps real bpf programs to be verified faster: before after bpf_lb-DLB_L3.o 2003 1940 bpf_lb-DLB_L4.o 3173 3089 bpf_lb-DUNKNOWN.o 1080 1065 bpf_lxc-DDROP_ALL.o 29584 28052 bpf_lxc-DUNKNOWN.o 36916 35487 bpf_netdev.o 11188 10864 bpf_overlay.o 6679 6643 bpf_lcx_jit.o 39555 38437 Reported-by: Anatoly Trosinenko Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 93 ++++++++++++++++++--- tools/testing/selftests/bpf/test_verifier.c | 4 +- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 751bb30b7c5c..55a49703f423 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3751,6 +3751,79 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, } } +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. 
Example: "if (reg < 5)" is unknown when register value range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ + if (__is_pointer_value(false, reg)) + return -1; + + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(reg->var_off)) + return !!tnum_equals_const(reg->var_off, val); + break; + case BPF_JNE: + if (tnum_is_const(reg->var_off)) + return !tnum_equals_const(reg->var_off, val); + break; + case BPF_JGT: + if (reg->umin_value > val) + return 1; + else if (reg->umax_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->smin_value > (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLT: + if (reg->umax_value < val) + return 1; + else if (reg->umin_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->smax_value < (s64)val) + return 1; + else if (reg->smin_value >= (s64)val) + return 0; + break; + case BPF_JGE: + if (reg->umin_value >= val) + return 1; + else if (reg->umax_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->smin_value >= (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLE: + if (reg->umax_value <= val) + return 1; + else if (reg->umin_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->smax_value <= (s64)val) + return 1; + else if (reg->smin_value > (s64)val) + return 0; + break; + } + + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -4152,21 +4225,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; - /* detect if R == 0 where R was initialized to zero earlier */ - if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == SCALAR_VALUE && - tnum_is_const(dst_reg->var_off)) { - if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || - (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { - /* if (imm == imm) goto pc+off; - * only follow the goto, ignore fall-through - */ + if (BPF_SRC(insn->code) == BPF_K) { + int pred = is_branch_taken(dst_reg, insn->imm, opcode); + + if (pred == 1) { + /* only follow the goto, ignore fall-through */ *insn_idx += insn->off; return 0; - } else { - /* if (imm != imm) goto pc+off; - * only follow fall-through branch, since + } else if (pred == 0) { + /* only follow fall-through branch, since * that's where the program will go */ return 0; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 5dd4410a716c..df6f751cc1e8 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8576,7 +8576,7 @@ static struct bpf_test tests[] = { BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, .fixup_map_hash_8b = { 4 }, - .errstr = "R0 invalid mem access 'inv'", + .errstr = "unbounded min value", .result = REJECT, }, { @@ -10547,7 +10547,7 @@ static struct bpf_test tests[] = { "check deducing bounds from const, 5", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, From ceefbc96fa5c5b975d87bf8e89ba8416f6b764d9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Dec 2018 22:46:06 -0800 Subject: [PATCH 64/93] bpf: add per-insn complexity limit malicious bpf program may try to force the verifier to 
remember a lot of distinct verifier states. Put a limit to number of per-insn 'struct bpf_verifier_state'. Note that hitting the limit doesn't reject the program. It potentially makes the verifier do more steps to analyze the program. It means that malicious programs will hit BPF_COMPLEXITY_LIMIT_INSNS sooner instead of spending cpu time walking long link list. The limit of BPF_COMPLEXITY_LIMIT_STATES==64 affects cilium progs with slight increase in number of "steps" it takes to successfully verify the programs: before after bpf_lb-DLB_L3.o 1940 1940 bpf_lb-DLB_L4.o 3089 3089 bpf_lb-DUNKNOWN.o 1065 1065 bpf_lxc-DDROP_ALL.o 28052 | 28162 bpf_lxc-DUNKNOWN.o 35487 | 35541 bpf_netdev.o 10864 10864 bpf_overlay.o 6643 6643 bpf_lcx_jit.o 38437 38437 But it also makes malicious program to be rejected in 0.4 seconds vs 6.5 Hence apply this limit to unprivileged programs only. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 55a49703f423..fc760d00a38c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -175,6 +175,7 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ @@ -5047,7 +5048,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; - int i, j, err; + int i, j, err, states_cnt = 0; sl = env->explored_states[insn_idx]; if (!sl) @@ -5074,8 +5075,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 1; } sl = sl->next; + states_cnt++; } + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + return 0; + /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) From 7b566f70e1bf65b189b66eb3de6f431c30f7dff2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 4 Dec 2018 08:47:44 -0800 Subject: [PATCH 65/93] phy: Revert toggling reset changes. This reverts: ef1b5bf506b1 ("net: phy: Fix not to call phy_resume() if PHY is not attached") 8c85f4b81296 ("net: phy: micrel: add toggling phy reset if PHY is not attached") Andrew Lunn informs me that there are alternative efforts underway to fix this more properly. Signed-off-by: David S. 
Miller --- drivers/net/phy/micrel.c | 8 -------- drivers/net/phy/phy_device.c | 11 +++++------ 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 1679a6ea104c..9265dea79412 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -23,7 +23,6 @@ * ksz9477 */ -#include #include #include #include @@ -836,13 +835,6 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; - if (!phydev->attached_dev) { - /* If the PHY is not attached, toggle the reset */ - phy_device_reset(phydev, 1); - udelay(1); - phy_device_reset(phydev, 0); - } - genphy_resume(phydev); ret = kszphy_config_reset(phydev); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c4b9008c52d2..18e92c19c5ab 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -220,7 +220,7 @@ static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); #ifdef CONFIG_PM -static bool mdio_bus_phy_may_suspend(struct phy_device *phydev, bool suspend) +static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->mdio.dev.driver; struct phy_driver *phydrv = to_phy_driver(drv); @@ -232,11 +232,10 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev, bool suspend) /* PHY not attached? May suspend if the PHY has not already been * suspended as part of a prior call to phy_disconnect() -> * phy_detach() -> phy_suspend() because the parent netdev might be the - * MDIO bus driver and clock gated at this point. Also may resume if - * PHY is not attached. + * MDIO bus driver and clock gated at this point. */ if (!netdev) - return suspend ? !phydev->suspended : phydev->suspended; + return !phydev->suspended; if (netdev->wol_enabled) return false; @@ -271,7 +270,7 @@ static int mdio_bus_phy_suspend(struct device *dev) if (phydev->attached_dev && phydev->adjust_link) phy_stop_machine(phydev); - if (!mdio_bus_phy_may_suspend(phydev, true)) + if (!mdio_bus_phy_may_suspend(phydev)) return 0; return phy_suspend(phydev); @@ -282,7 +281,7 @@ static int mdio_bus_phy_resume(struct device *dev) struct phy_device *phydev = to_phy_device(dev); int ret; - if (!mdio_bus_phy_may_suspend(phydev, false)) + if (!mdio_bus_phy_may_suspend(phydev)) goto no_resume; ret = phy_resume(phydev); From 688838934c231bb08f46db687e57f6d8bf82709c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Dec 2018 09:40:35 -0800 Subject: [PATCH 66/93] rtnetlink: ndo_dflt_fdb_dump() only work for ARPHRD_ETHER devices kmsan was able to trigger a kernel-infoleak using a gre device [1] nlmsg_populate_fdb_fill() has a hard coded assumption that dev->addr_len is ETH_ALEN, as normally guaranteed for ARPHRD_ETHER devices. 
A similar issue was fixed recently in commit da71577545a5 ("rtnetlink: Disallow FDB configuration for non-Ethernet device") [1] BUG: KMSAN: kernel-infoleak in copyout lib/iov_iter.c:143 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x4c0/0x2700 lib/iov_iter.c:576 CPU: 0 PID: 6697 Comm: syz-executor310 Not tainted 4.20.0-rc3+ #95 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x32d/0x480 lib/dump_stack.c:113 kmsan_report+0x12c/0x290 mm/kmsan/kmsan.c:683 kmsan_internal_check_memory+0x32a/0xa50 mm/kmsan/kmsan.c:743 kmsan_copy_to_user+0x78/0xd0 mm/kmsan/kmsan_hooks.c:634 copyout lib/iov_iter.c:143 [inline] _copy_to_iter+0x4c0/0x2700 lib/iov_iter.c:576 copy_to_iter include/linux/uio.h:143 [inline] skb_copy_datagram_iter+0x4e2/0x1070 net/core/datagram.c:431 skb_copy_datagram_msg include/linux/skbuff.h:3316 [inline] netlink_recvmsg+0x6f9/0x19d0 net/netlink/af_netlink.c:1975 sock_recvmsg_nosec net/socket.c:794 [inline] sock_recvmsg+0x1d1/0x230 net/socket.c:801 ___sys_recvmsg+0x444/0xae0 net/socket.c:2278 __sys_recvmsg net/socket.c:2327 [inline] __do_sys_recvmsg net/socket.c:2337 [inline] __se_sys_recvmsg+0x2fa/0x450 net/socket.c:2334 __x64_sys_recvmsg+0x4a/0x70 net/socket.c:2334 do_syscall_64+0xcf/0x110 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x441119 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fffc7f008a8 EFLAGS: 00000207 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000441119 RDX: 0000000000000040 RSI: 00000000200005c0 RDI: 0000000000000003 RBP: 00000000006cc018 R08: 0000000000000100 R09: 0000000000000100 R10: 0000000000000100 R11: 0000000000000207 R12: 0000000000402080 R13: 0000000000402110 R14: 0000000000000000 R15: 0000000000000000 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:246 [inline] kmsan_save_stack mm/kmsan/kmsan.c:261 [inline] kmsan_internal_chain_origin+0x13d/0x240 mm/kmsan/kmsan.c:469 kmsan_memcpy_memmove_metadata+0x1a9/0xf70 mm/kmsan/kmsan.c:344 kmsan_memcpy_metadata+0xb/0x10 mm/kmsan/kmsan.c:362 __msan_memcpy+0x61/0x70 mm/kmsan/kmsan_instr.c:162 __nla_put lib/nlattr.c:744 [inline] nla_put+0x20a/0x2d0 lib/nlattr.c:802 nlmsg_populate_fdb_fill+0x444/0x810 net/core/rtnetlink.c:3466 nlmsg_populate_fdb net/core/rtnetlink.c:3775 [inline] ndo_dflt_fdb_dump+0x73a/0x960 net/core/rtnetlink.c:3807 rtnl_fdb_dump+0x1318/0x1cb0 net/core/rtnetlink.c:3979 netlink_dump+0xc79/0x1c90 net/netlink/af_netlink.c:2244 __netlink_dump_start+0x10c4/0x11d0 net/netlink/af_netlink.c:2352 netlink_dump_start include/linux/netlink.h:216 [inline] rtnetlink_rcv_msg+0x141b/0x1540 net/core/rtnetlink.c:4910 netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4965 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x1699/0x1740 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x13c7/0x1440 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe3b/0x1240 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xcf/0x110 
arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:246 [inline] kmsan_internal_poison_shadow+0x6d/0x130 mm/kmsan/kmsan.c:170 kmsan_kmalloc+0xa1/0x100 mm/kmsan/kmsan_hooks.c:186 __kmalloc+0x14c/0x4d0 mm/slub.c:3825 kmalloc include/linux/slab.h:551 [inline] __hw_addr_create_ex net/core/dev_addr_lists.c:34 [inline] __hw_addr_add_ex net/core/dev_addr_lists.c:80 [inline] __dev_mc_add+0x357/0x8a0 net/core/dev_addr_lists.c:670 dev_mc_add+0x6d/0x80 net/core/dev_addr_lists.c:687 ip_mc_filter_add net/ipv4/igmp.c:1128 [inline] igmp_group_added+0x4d4/0xb80 net/ipv4/igmp.c:1311 __ip_mc_inc_group+0xea9/0xf70 net/ipv4/igmp.c:1444 ip_mc_inc_group net/ipv4/igmp.c:1453 [inline] ip_mc_up+0x1c3/0x400 net/ipv4/igmp.c:1775 inetdev_event+0x1d03/0x1d80 net/ipv4/devinet.c:1522 notifier_call_chain kernel/notifier.c:93 [inline] __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x13d/0x240 kernel/notifier.c:401 __dev_notify_flags+0x3da/0x860 net/core/dev.c:1733 dev_change_flags+0x1ac/0x230 net/core/dev.c:7569 do_setlink+0x165f/0x5ea0 net/core/rtnetlink.c:2492 rtnl_newlink+0x2ad7/0x35a0 net/core/rtnetlink.c:3111 rtnetlink_rcv_msg+0x1148/0x1540 net/core/rtnetlink.c:4947 netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4965 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x1699/0x1740 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x13c7/0x1440 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe3b/0x1240 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xcf/0x110 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Bytes 36-37 of 105 are uninitialized Memory access of size 105 starts at ffff88819686c000 Data copied to user address 0000000020000380 Fixes: d83b06036048 ("net: add fdb generic dump routine") Signed-off-by: Eric Dumazet Cc: John Fastabend Cc: Ido Schimmel Cc: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 33d9227a8b80..7819f7804eeb 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3800,6 +3800,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, { int err; + if (dev->type != ARPHRD_ETHER) + return -EINVAL; + netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) From ecb239d96d369c23c33d41708646df646de669f4 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 3 Dec 2018 13:21:01 +0100 Subject: [PATCH 67/93] ethernet: fman: fix wrong of_node_put() in probe function After getting a reference to the platform device's of_node the probe function ends up calling of_find_matching_node() using the node as an argument. The function takes care of decreasing the refcount on it. We are then incorrectly decreasing the refcount on that node again. This patch removes the unwarranted call to of_node_put(). Fixes: 414fd46e7762 ("fsl/fman: Add FMan support") Signed-off-by: Nicolas Saenz Julienne Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/fman/fman.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index c415ac67cb7b..e80fedb27cee 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -2786,7 +2786,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev) if (!muram_node) { dev_err(&of_dev->dev, "%s: could not find MURAM node\n", __func__); - goto fman_node_put; + goto fman_free; } err = of_address_to_resource(muram_node, 0, @@ -2795,11 +2795,10 @@ static struct fman *read_dts_node(struct platform_device *of_dev) of_node_put(muram_node); dev_err(&of_dev->dev, "%s: of_address_to_resource() = %d\n", __func__, err); - goto fman_node_put; + goto fman_free; } of_node_put(muram_node); - of_node_put(fm_node); err = devm_request_irq(&of_dev->dev, irq, fman_irq, IRQF_SHARED, "fman", fman); From 01b3fd5ac97caffb8e5d5bd85086da33db3b361f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 4 Dec 2018 16:03:52 +0200 Subject: [PATCH 68/93] net: mvpp2: fix detection of 10G SFP modules The mvpp2_phylink_validate() relies on the interface field of phylink_link_state to determine valid link modes. However, when called from phylink_sfp_module_insert() this field in not initialized. The default switch case then excludes 10G link modes. This allows 10G SFP modules that are detected correctly to be configured at max rate of 2.5G. Catch the uninitialized PHY mode case, and allow 10G rates. Fixes: d97c9f4ab000b ("net: mvpp2: 1000baseX support") Cc: Maxime Chevallier Cc: Antoine Tenart Acked-by: Russell King Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 7a37a37e3fb3..eb1dc8abc359 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4384,6 +4384,7 @@ static void mvpp2_phylink_validate(struct net_device *dev, switch (state->interface) { case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_NA: phylink_set(mask, 10000baseCR_Full); phylink_set(mask, 10000baseSR_Full); phylink_set(mask, 10000baseLR_Full); From 0fb628f0f250c74b1023edd0ca4a57c8b35b9b2c Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 4 Dec 2018 16:03:53 +0200 Subject: [PATCH 69/93] net: mvpp2: fix phylink handling of invalid PHY modes The .validate phylink callback should empty the supported bitmap when the interface mode is invalid. Cc: Maxime Chevallier Cc: Antoine Tenart Reported-by: Russell King Signed-off-by: Baruch Siach Signed-off-by: David S. 
Miller --- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index eb1dc8abc359..125ea99418df 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4375,8 +4375,27 @@ static void mvpp2_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) { + struct mvpp2_port *port = netdev_priv(dev); __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; + /* Invalid combinations */ + switch (state->interface) { + case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_XAUI: + if (port->gop_id != 0) + goto empty_set; + break; + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + if (port->gop_id == 0) + goto empty_set; + break; + default: + break; + } + phylink_set(mask, Autoneg); phylink_set_port_modes(mask); phylink_set(mask, Pause); @@ -4384,6 +4403,7 @@ static void mvpp2_phylink_validate(struct net_device *dev, switch (state->interface) { case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_XAUI: case PHY_INTERFACE_MODE_NA: phylink_set(mask, 10000baseCR_Full); phylink_set(mask, 10000baseSR_Full); @@ -4392,7 +4412,11 @@ static void mvpp2_phylink_validate(struct net_device *dev, phylink_set(mask, 10000baseER_Full); phylink_set(mask, 10000baseKR_Full); /* Fall-through */ - default: + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + case PHY_INTERFACE_MODE_SGMII: phylink_set(mask, 10baseT_Half); phylink_set(mask, 10baseT_Full); phylink_set(mask, 100baseT_Half); @@ -4404,11 +4428,18 @@ static void mvpp2_phylink_validate(struct net_device *dev, phylink_set(mask, 1000baseT_Full); phylink_set(mask, 1000baseX_Full); phylink_set(mask, 2500baseX_Full); + break; + default: + goto empty_set; } bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_and(state->advertising, state->advertising, mask, __ETHTOOL_LINK_MODE_MASK_NBITS); + return; + +empty_set: + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); } static void mvpp22_xlg_link_state(struct mvpp2_port *port, From a317e65face482371de30246b6494feb093ff7f9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 13 Nov 2018 20:32:13 +0100 Subject: [PATCH 70/93] mac80211: ignore tx status for PS stations in ieee80211_tx_status_ext Make it behave like regular ieee80211_tx_status calls, except for the lack of filtered frame processing. This fixes spurious low-ack triggered disconnections with powersave clients connected to an AP. 
Fixes: f027c2aca0cf4 ("mac80211: add ieee80211_tx_status_noskb") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/status.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index aa4afbf0abaf..a794ca729000 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -964,6 +964,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, /* Track when last TDLS packet was ACKed */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) sta->status_stats.last_tdls_pkt_time = jiffies; + } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + return; } else { ieee80211_lost_packet(sta, info); } From 9ec1190d065998650fd9260dea8cf3e1f56c0e8c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 28 Nov 2018 22:39:16 +0100 Subject: [PATCH 71/93] mac80211: fix reordering of buffered broadcast packets If the buffered broadcast queue contains packets, letting new packets bypass that queue can lead to heavy reordering, since the driver is probably throttling transmission of buffered multicast packets after beacons. Keep buffering packets until the buffer has been cleared (and no client is in powersave mode). Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e0ccee23fbcd..1f536ba573b4 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -439,8 +439,8 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; - /* no stations in PS mode */ - if (!atomic_read(&ps->num_sta_ps)) + /* no stations in PS mode and no buffered packets */ + if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf)) return TX_CONTINUE; info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; From 990d71846a0b7281bd933c34d734e6afc7408e7e Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 3 Dec 2018 21:16:07 +0200 Subject: [PATCH 72/93] mac80211: ignore NullFunc frames in the duplicate detection NullFunc packets should never be considered duplicates, just like QoS-NullFunc packets. We saw a client that enters / exits power save with NullFunc frames (and not with QoS-NullFunc) despite the fact that the association supports HT. This specific client also re-uses a non-zero sequence number for different NullFunc frames. At some point, the client had to send a retransmission of the NullFunc frame and we dropped it, leading to a misalignment in the power save state. Fix this by never considering a NullFunc frame as a duplicate, just like we do for QoS NullFunc frames. 
This fixes https://bugzilla.kernel.org/show_bug.cgi?id=201449 CC: Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a69ecfb212ed..428f7ad5f9b5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1403,6 +1403,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (ieee80211_is_ctl(hdr->frame_control) || + ieee80211_is_nullfunc(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; From 312ca38ddda64bac6513ec68e0ac3789b4eb44dc Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 5 Dec 2018 12:55:54 +0200 Subject: [PATCH 73/93] cfg80211: Fix busy loop regression in ieee80211_ie_split_ric() This function was modified to support the information element extension case (WLAN_EID_EXTENSION) in a manner that would result in an infinite loop when going through set of IEs that include WLAN_EID_RIC_DATA and contain an IE that is in the after_ric array. The only place where this can currently happen is in mac80211 ieee80211_send_assoc() where ieee80211_ie_split_ric() is called with after_ric[]. This can be triggered by valid data from user space nl80211 association/connect request (i.e., requiring GENL_UNS_ADMIN_PERM). The only known application having an option to include WLAN_EID_RIC_DATA in these requests is wpa_supplicant and it had a bug that prevented this specific contents from being used (and because of that, not triggering this kernel bug in an automated test case ap_ft_ric) and now that this bug is fixed, it has a workaround to avoid this kernel issue. WLAN_EID_RIC_DATA is currently used only for testing purposes, so this does not cause significant harm for production use cases. Fixes: 2512b1b18d07 ("mac80211: extend ieee80211_ie_split to support EXTENSION") Cc: stable@vger.kernel.org Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index ef14d80ca03e..d473bd135da8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1421,6 +1421,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); + else + break; } } else { pos = skip_ie(ies, ielen, pos); From 22f6bbb7bcfcef0b373b0502a7ff390275c575dd Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 4 Dec 2018 17:37:57 +0000 Subject: [PATCH 74/93] net: use skb_list_del_init() to remove from RX sublists list_del() leaves the skb->next pointer poisoned, which can then lead to a crash in e.g. OVS forwarding. For example, setting up an OVS VXLAN forwarding bridge on sfc as per: ======== $ ovs-vsctl show 5dfd9c47-f04b-4aaa-aa96-4fbb0a522a30 Bridge "br0" Port "br0" Interface "br0" type: internal Port "enp6s0f0" Interface "enp6s0f0" Port "vxlan0" Interface "vxlan0" type: vxlan options: {key="1", local_ip="10.0.0.5", remote_ip="10.0.0.4"} ovs_version: "2.5.0" ======== (where 10.0.0.5 is an address on enp6s0f1) and sending traffic across it will lead to the following panic: ======== general protection fault: 0000 [#1] SMP PTI CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.20.0-rc3-ehc+ #701 Hardware name: Dell Inc. 
PowerEdge R710/0M233H, BIOS 6.4.0 07/23/2013 RIP: 0010:dev_hard_start_xmit+0x38/0x200 Code: 53 48 89 fb 48 83 ec 20 48 85 ff 48 89 54 24 08 48 89 4c 24 18 0f 84 ab 01 00 00 48 8d 86 90 00 00 00 48 89 f5 48 89 44 24 10 <4c> 8b 33 48 c7 03 00 00 00 00 48 8b 05 c7 d1 b3 00 4d 85 f6 0f 95 RSP: 0018:ffff888627b437e0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: dead000000000100 RCX: ffff88862279c000 RDX: ffff888614a342c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff888618a88000 R08: 0000000000000001 R09: 00000000000003e8 R10: 0000000000000000 R11: ffff888614a34140 R12: 0000000000000000 R13: 0000000000000062 R14: dead000000000100 R15: ffff888616430000 FS: 0000000000000000(0000) GS:ffff888627b40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6d2bc6d000 CR3: 000000000200a000 CR4: 00000000000006e0 Call Trace: __dev_queue_xmit+0x623/0x870 ? masked_flow_lookup+0xf7/0x220 [openvswitch] ? ep_poll_callback+0x101/0x310 do_execute_actions+0xaba/0xaf0 [openvswitch] ? __wake_up_common+0x8a/0x150 ? __wake_up_common_lock+0x87/0xc0 ? queue_userspace_packet+0x31c/0x5b0 [openvswitch] ovs_execute_actions+0x47/0x120 [openvswitch] ovs_dp_process_packet+0x7d/0x110 [openvswitch] ovs_vport_receive+0x6e/0xd0 [openvswitch] ? dst_alloc+0x64/0x90 ? rt_dst_alloc+0x50/0xd0 ? ip_route_input_slow+0x19a/0x9a0 ? __udp_enqueue_schedule_skb+0x198/0x1b0 ? __udp4_lib_rcv+0x856/0xa30 ? __udp4_lib_rcv+0x856/0xa30 ? cpumask_next_and+0x19/0x20 ? find_busiest_group+0x12d/0xcd0 netdev_frame_hook+0xce/0x150 [openvswitch] __netif_receive_skb_core+0x205/0xae0 __netif_receive_skb_list_core+0x11e/0x220 netif_receive_skb_list+0x203/0x460 ? __efx_rx_packet+0x335/0x5e0 [sfc] efx_poll+0x182/0x320 [sfc] net_rx_action+0x294/0x3c0 __do_softirq+0xca/0x297 irq_exit+0xa6/0xb0 do_IRQ+0x54/0xd0 common_interrupt+0xf/0xf ======== So, in all listified-receive handling, instead pull skbs off the lists with skb_list_del_init(). Fixes: 9af86f933894 ("net: core: fix use-after-free in __netif_receive_skb_list_core") Fixes: 7da517a3bc52 ("net: core: Another step of skb receive list processing") Fixes: a4ca8b7df73c ("net: ipv4: fix drop handling in ip_list_rcv() and ip_list_rcv_finish()") Fixes: d8269e2cbf90 ("net: ipv6: listify ipv6_rcv() and ip6_rcv_finish()") Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++---- net/ipv4/ip_input.c | 4 ++-- net/ipv6/ip6_input.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index e06223b65674..722d50dbf8a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5014,7 +5014,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; - list_del(&skb->list); + skb_list_del_init(skb); __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; @@ -5170,7 +5170,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(netdev_tstamp_prequeue, skb); - list_del(&skb->list); + skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } @@ -5181,7 +5181,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); list_for_each_entry_safe(skb, next, head, list) { xdp_prog = rcu_dereference(skb->dev->xdp_prog); - list_del(&skb->list); + skb_list_del_init(skb); if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) list_add_tail(&skb->list, &sublist); } @@ -5200,7 +5200,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) if (cpu >= 0) { /* Will be handled, remove from list */ - list_del(&skb->list); + skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 35a786c0aaa0..e609b08c9df4 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -547,7 +547,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -594,7 +594,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..c1d85830c906 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -95,7 +95,7 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -296,7 +296,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) continue; From 41727549de3e7281feb174d568c6e46823db8684 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Dec 2018 14:24:31 -0800 Subject: [PATCH 75/93] tcp: Do not underestimate rwnd_limited If available rwnd is too small, tcp_tso_should_defer() can decide it is worth waiting before splitting a TSO packet. This really means we are rwnd limited. Fixes: 5615f88614a4 ("tcp: instrument how long TCP is limited by receive window") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 68b5326f7321..318690234758 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2356,8 +2356,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, - max_segs)) + max_segs)) { + if (!is_cwnd_limited) + is_rwnd_limited = true; break; + } } limit = mss_now; From b2b7af861122a0c0f6260155c29a1b2e594cd5b5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 5 Dec 2018 14:38:38 -0800 Subject: [PATCH 76/93] tcp: fix NULL ref in tail loss probe TCP loss probe timer may fire when the retransmission queue is empty but has a non-zero tp->packets_out counter. tcp_send_loss_probe will call tcp_rearm_rto which triggers a NULL pointer dereference by fetching the retransmission queue head in its sub-routines. Add a more detailed warning to help catch the root cause of the inflight accounting inconsistency. Reported-by: Rafael Tinoco Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 318690234758..5aa600900695 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2497,15 +2497,18 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); + if (unlikely(!skb)) { + WARN_ONCE(tp->packets_out, + "invalid inflight: %u state %u cwnd %u mss %d\n", + tp->packets_out, sk->sk_state, tp->snd_cwnd, mss); + inet_csk(sk)->icsk_pending = 0; + return; + } /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) goto rearm_timer; - /* Retransmit last segment. */ - if (WARN_ON(!skb)) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; From afd0a8006e98b1890908f81746c94ca5dae29d7c Mon Sep 17 00:00:00 2001 From: Jakub Audykowicz Date: Tue, 4 Dec 2018 20:27:41 +0100 Subject: [PATCH 77/93] sctp: frag_point sanity check If for some reason an association's fragmentation point is zero, sctp_datamsg_from_user will endlessly try to divide a message into zero-sized chunks. This eventually causes a kernel panic due to running out of memory. Although this situation is quite unlikely, it has occurred before as reported. I propose to add this simple last-ditch sanity check due to the severity of the potential consequences. Signed-off-by: Jakub Audykowicz Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/net/sctp/sctp.h | 5 +++++ net/sctp/chunk.c | 6 ++++++ net/sctp/socket.c | 3 +-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index ab9242e51d9e..2abbc15824af 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -620,4 +620,9 @@ static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) return false; } +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) +{ + return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); +} + #endif /* __net_sctp_h__ */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index ce8087846f05..d2048de86e7c 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -191,6 +191,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * the packet */ max_data = asoc->frag_point; + if (unlikely(!max_data)) { + max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), + sctp_datachk_len(&asoc->stream)); + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%Zu)", + __func__, asoc, max_data); + } /* If the the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bf618d1b41fd..b8cebd5a87e5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3324,8 +3324,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) : sizeof(struct sctp_data_chunk); - min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, - datasize); + min_len = sctp_min_frag_point(sp, datasize); max_len = SCTP_MAX_CHUNK_LEN - datasize; if (val < min_len || val > max_len) From ebaf39e6032faf77218220707fc3fa22487784e0 Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Wed, 5 Dec 2018 16:55:29 +0100 Subject: [PATCH 78/93] ipv4: ipv6: netfilter: Adjust the frag mem limit when truesize changes The *_frag_reasm() functions are susceptible to miscalculating the byte count of packet fragments in case the truesize of a head buffer changes. The truesize member may be changed by the call to skb_unclone(), leaving the fragment memory limit counter unbalanced even if all fragments are processed. This miscalculation goes unnoticed as long as the network namespace which holds the counter is not destroyed. Should an attempt be made to destroy a network namespace that holds an unbalanced fragment memory limit counter the cleanup of the namespace never finishes. The thread handling the cleanup gets stuck in inet_frags_exit_net() waiting for the percpu counter to reach zero. The thread is usually in running state with a stacktrace similar to: PID: 1073 TASK: ffff880626711440 CPU: 1 COMMAND: "kworker/u48:4" #5 [ffff880621563d48] _raw_spin_lock at ffffffff815f5480 #6 [ffff880621563d48] inet_evict_bucket at ffffffff8158020b #7 [ffff880621563d80] inet_frags_exit_net at ffffffff8158051c #8 [ffff880621563db0] ops_exit_list at ffffffff814f5856 #9 [ffff880621563dd8] cleanup_net at ffffffff814f67c0 #10 [ffff880621563e38] process_one_work at ffffffff81096f14 It is not possible to create new network namespaces, and processes that call unshare() end up being stuck in uninterruptible sleep state waiting to acquire the net_mutex. The bug was observed in the IPv6 netfilter code by Per Sundstrom. I thank him for his analysis of the problem. The parts of this patch that apply to IPv4 and IPv6 fragment reassembly are preemptive measures. 
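All three reassembly paths below apply the same accounting pattern; a minimal sketch of it (queue field names abbreviated, error handling reduced to a label):

	int delta = -head->truesize;		/* remember the old head charge */

	/* Head of list must not be cloned; this may replace the head buffer. */
	if (skb_unclone(head, GFP_ATOMIC))
		goto out_oom;

	delta += head->truesize;		/* account for any truesize change */
	if (delta)
		add_frag_mem_limit(q->net, delta);	/* rebalance the counter */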
Signed-off-by: Jiri Wiesner Reported-by: Per Sundstrom Acked-by: Peter Oskolkov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 7 +++++++ net/ipv6/netfilter/nf_conntrack_reasm.c | 8 +++++++- net/ipv6/reassembly.c | 8 +++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d6ee343fdb86..aa0b22697998 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -515,6 +515,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct rb_node *rbn; int len; int ihlen; + int delta; int err; u8 ecn; @@ -556,10 +557,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, if (len > 65535) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; + delta += head->truesize; + if (delta) + add_frag_mem_limit(qp->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d219979c3e52..181da2c40f9a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -341,7 +341,7 @@ static bool nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; u8 ecn; inet_frag_kill(&fq->q); @@ -363,10 +363,16 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic return false; } + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) return false; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c3c92713096..aa26c45486d9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -281,7 +281,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, { struct net *net = container_of(fq->q.net, struct net, ipv6.frags); struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; unsigned int nhoff; int sum_truesize; u8 ecn; @@ -322,10 +322,16 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (payload_len > IPV6_MAXPLEN) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ From 050fc01fb1d916058605569cd7f4e15152afc3af Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 Dec 2018 17:44:50 +0000 Subject: [PATCH 79/93] mlxsw: spectrum_nve: Remove easily triggerable warnings It is possible to trigger a warning in mlxsw in case a flood entry which mlxsw is not aware of is deleted from the VxLAN device. This is because mlxsw expects to find a singly linked list where the flood entry is present in. Fix by removing these warnings for now. 
Will re-add them in the next release after we teach mlxsw to ask for a dump of FDB entries from the VxLAN device, once it is enslaved to a bridge mlxsw cares about. Fixes: 6e6030bd5412 ("mlxsw: spectrum_nve: Implement common NVE core") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c index ad06d9969bc1..5c13674439f1 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c @@ -560,7 +560,7 @@ static void mlxsw_sp_nve_mc_list_ip_del(struct mlxsw_sp *mlxsw_sp, mc_record = mlxsw_sp_nve_mc_record_find(mc_list, proto, addr, &mc_entry); - if (WARN_ON(!mc_record)) + if (!mc_record) return; mlxsw_sp_nve_mc_record_entry_del(mc_record, mc_entry); @@ -647,7 +647,7 @@ void mlxsw_sp_nve_flood_ip_del(struct mlxsw_sp *mlxsw_sp, key.fid_index = mlxsw_sp_fid_index(fid); mc_list = mlxsw_sp_nve_mc_list_find(mlxsw_sp, &key); - if (WARN_ON(!mc_list)) + if (!mc_list) return; mlxsw_sp_nve_fid_flood_index_clear(fid, mc_list); From f58a83c207b791c6586b9675a589db5c6ac7909e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 Dec 2018 17:44:51 +0000 Subject: [PATCH 80/93] mlxsw: spectrum_switchdev: Avoid leaking FID's reference count It should never be possible for a user to set a VNI on a FID in case one is already set. The driver therefore returns an error, but fails to drop the reference count taken earlier when calling mlxsw_sp_fid_8021d_lookup(). Drop the reference when this unlikely error is hit. Fixes: 1c30d1836aeb ("mlxsw: spectrum: Enable VxLAN enslavement to bridges") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 739a51f0a366..7f2091c2648e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2134,8 +2134,10 @@ mlxsw_sp_bridge_8021d_vxlan_join(struct mlxsw_sp_bridge_device *bridge_device, if (!fid) return -EINVAL; - if (mlxsw_sp_fid_vni_is_set(fid)) - return -EINVAL; + if (mlxsw_sp_fid_vni_is_set(fid)) { + err = -EINVAL; + goto err_vni_exists; + } err = mlxsw_sp_nve_fid_enable(mlxsw_sp, fid, ¶ms, extack); if (err) @@ -2149,6 +2151,7 @@ mlxsw_sp_bridge_8021d_vxlan_join(struct mlxsw_sp_bridge_device *bridge_device, return 0; err_nve_fid_enable: +err_vni_exists: mlxsw_sp_fid_put(fid); return err; } From da93d2913fdf43d5cde3c5a53ac9cc29684d5c7c Mon Sep 17 00:00:00 2001 From: Nir Dotan Date: Thu, 6 Dec 2018 17:44:52 +0000 Subject: [PATCH 81/93] mlxsw: spectrum_router: Relax GRE decap matching check GRE decap offload is configured when local routes prefix correspond to the local address of one of the offloaded GRE tunnels. The matching check was found to be too strict, such that for a flat GRE configuration, in which the overlay and underlay traffic share the same non-default VRF, decap flow was not offloaded. Relax the check for decap flow offloading. A match occurs if the local address of the tunnel matches the local route address while both share the same VRF table. 
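In effect the match now reduces to the following check (a simplified sketch, not the exact driver code; proto_matches stands in for the ul_proto comparison):

	return proto_matches &&
	       mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, ul_dip,
						 ul_tb_id, ipip_entry);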
Fixes: 4607f6d26950 ("mlxsw: spectrum_router: Support IPv4 underlay decap") Signed-off-by: Nir Dotan Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 9e9bb57134f2..6ebf99cc3154 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1275,15 +1275,12 @@ mlxsw_sp_ipip_entry_matches_decap(struct mlxsw_sp *mlxsw_sp, { u32 ul_tb_id = l3mdev_fib_table(ul_dev) ? : RT_TABLE_MAIN; enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt; - struct net_device *ipip_ul_dev; if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto) return false; - ipip_ul_dev = __mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev); return mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, ul_dip, - ul_tb_id, ipip_entry) && - (!ipip_ul_dev || ipip_ul_dev == ul_dev); + ul_tb_id, ipip_entry); } /* Given decap parameters, find the corresponding IPIP entry. */ From 993107fea5eefdfdfde1ca38d3f01f0bebf76e77 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 Dec 2018 17:44:53 +0000 Subject: [PATCH 82/93] mlxsw: spectrum_switchdev: Fix VLAN device deletion via ioctl When deleting a VLAN device using an ioctl the netdev is unregistered before the VLAN filter is updated via ndo_vlan_rx_kill_vid(). It can lead to a use-after-free in mlxsw in case the VLAN device is deleted while being enslaved to a bridge. The reason for the above is that when mlxsw receives the CHANGEUPPER event, it wrongly assumes that the VLAN device is no longer its upper and thus destroys the internal representation of the bridge port despite the reference count being non-zero. Fix this by checking if the VLAN device is our upper using its real device. In net-next I'm going to remove this trick and instead make mlxsw completely agnostic to the order of the events. Fixes: c57529e1d5d8 ("mlxsw: spectrum: Replace vPorts with Port-VLAN") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 7f2091c2648e..50080c60a279 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -296,7 +296,13 @@ static bool mlxsw_sp_bridge_port_should_destroy(const struct mlxsw_sp_bridge_port * bridge_port) { - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(bridge_port->dev); + struct net_device *dev = bridge_port->dev; + struct mlxsw_sp *mlxsw_sp; + + if (is_vlan_dev(dev)) + mlxsw_sp = mlxsw_sp_lower_get(vlan_dev_real_dev(dev)); + else + mlxsw_sp = mlxsw_sp_lower_get(dev); /* In case ports were pulled from out of a bridged LAG, then * it's possible the reference count isn't zero, yet the bridge @@ -2109,7 +2115,7 @@ mlxsw_sp_bridge_8021d_port_leave(struct mlxsw_sp_bridge_device *bridge_device, vid = is_vlan_dev(dev) ? 
vlan_dev_vlan_id(dev) : 1; mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); - if (WARN_ON(!mlxsw_sp_port_vlan)) + if (!mlxsw_sp_port_vlan) return; mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan); From 1b4e5ad5d6b9f15cd0b5121f86d4719165958417 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Fri, 7 Dec 2018 09:50:17 +0200 Subject: [PATCH 83/93] ipv6: sr: properly initialize flowi6 prior passing to ip6_route_output In 'seg6_output', stack variable 'struct flowi6 fl6' was missing initialization. Fixes: 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels") Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/ipv6/seg6_iptunnel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index a8854dd3e9c5..8181ee7e1e27 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -347,6 +347,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); From f9bfe4e6a9d08d405fe7b081ee9a13e649c97ecf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Dec 2018 09:58:24 -0800 Subject: [PATCH 84/93] tcp: lack of available data can also cause TSO defer tcp_tso_should_defer() can return true in three different cases : 1) We are cwnd-limited 2) We are rwnd-limited 3) We are application limited. Neal pointed out that my recent fix went too far, since it assumed that if we were not in 1) case, we must be rwnd-limited Fix this by properly populating the is_cwnd_limited and is_rwnd_limited booleans. After this change, we can finally move the silly check for FIN flag only for the application-limited case. The same move for EOR bit will be handled in net-next, since commit 1c09f7d073b1 ("tcp: do not try to defer skbs with eor mark (MSG_EOR)") is scheduled for linux-4.21 Tested by running 200 concurrent netperf -t TCP_RR -- -r 60000,100 and checking none of them was rwnd_limited in the chrono_stat output from "ss -ti" command. Fixes: 41727549de3e ("tcp: Do not underestimate rwnd_limited") Signed-off-by: Eric Dumazet Suggested-by: Neal Cardwell Reviewed-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5aa600900695..d1676d8a6ed7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1904,7 +1904,9 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited, u32 max_segs) + bool *is_cwnd_limited, + bool *is_rwnd_limited, + u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); u32 age, send_win, cong_win, limit, in_flight; @@ -1912,9 +1914,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, struct sk_buff *head; int win_divisor; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - goto send_now; - if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; @@ -1973,10 +1972,27 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if (age < (tp->srtt_us >> 4)) goto send_now; - /* Ok, it looks like it is advisable to defer. 
*/ + /* Ok, it looks like it is advisable to defer. + * Three cases are tracked : + * 1) We are cwnd-limited + * 2) We are rwnd-limited + * 3) We are application limited. + */ + if (cong_win < send_win) { + if (cong_win <= skb->len) { + *is_cwnd_limited = true; + return true; + } + } else { + if (send_win <= skb->len) { + *is_rwnd_limited = true; + return true; + } + } - if (cong_win < send_win && cong_win <= skb->len) - *is_cwnd_limited = true; + /* If this packet won't get more data, do not wait. */ + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto send_now; return true; @@ -2356,11 +2372,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, - max_segs)) { - if (!is_cwnd_limited) - is_rwnd_limited = true; + &is_rwnd_limited, max_segs)) break; - } } limit = mss_now; From 66033f47ca60294a95fc85ec3a3cc909dab7b765 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 6 Dec 2018 19:30:36 +0100 Subject: [PATCH 85/93] ipv6: Check available headroom in ip6_xmit() even without options Even if we send an IPv6 packet without options, MAX_HEADER might not be enough to account for the additional headroom required by alignment of hardware headers. On a configuration without HYPERV_NET, WLAN, AX25, and with IPV6_TUNNEL, sending short SCTP packets over IPv4 over L2TP over IPv6, we start with 100 bytes of allocated headroom in sctp_packet_transmit(), end up with 54 bytes after l2tp_xmit_skb(), and 14 bytes in ip6_finish_output2(). Those would be enough to append our 14 bytes header, but we're going to align that to 16 bytes, and write 2 bytes out of the allocated slab in neigh_hh_output(). KASan says: [ 264.967848] ================================================================== [ 264.967861] BUG: KASAN: slab-out-of-bounds in ip6_finish_output2+0x1aec/0x1c70 [ 264.967866] Write of size 16 at addr 000000006af1c7fe by task netperf/6201 [ 264.967870] [ 264.967876] CPU: 0 PID: 6201 Comm: netperf Not tainted 4.20.0-rc4+ #1 [ 264.967881] Hardware name: IBM 2827 H43 400 (z/VM 6.4.0) [ 264.967887] Call Trace: [ 264.967896] ([<00000000001347d6>] show_stack+0x56/0xa0) [ 264.967903] [<00000000017e379c>] dump_stack+0x23c/0x290 [ 264.967912] [<00000000007bc594>] print_address_description+0xf4/0x290 [ 264.967919] [<00000000007bc8fc>] kasan_report+0x13c/0x240 [ 264.967927] [<000000000162f5e4>] ip6_finish_output2+0x1aec/0x1c70 [ 264.967935] [<000000000163f890>] ip6_finish_output+0x430/0x7f0 [ 264.967943] [<000000000163fe44>] ip6_output+0x1f4/0x580 [ 264.967953] [<000000000163882a>] ip6_xmit+0xfea/0x1ce8 [ 264.967963] [<00000000017396e2>] inet6_csk_xmit+0x282/0x3f8 [ 264.968033] [<000003ff805fb0ba>] l2tp_xmit_skb+0xe02/0x13e0 [l2tp_core] [ 264.968037] [<000003ff80631192>] l2tp_eth_dev_xmit+0xda/0x150 [l2tp_eth] [ 264.968041] [<0000000001220020>] dev_hard_start_xmit+0x268/0x928 [ 264.968069] [<0000000001330e8e>] sch_direct_xmit+0x7ae/0x1350 [ 264.968071] [<000000000122359c>] __dev_queue_xmit+0x2b7c/0x3478 [ 264.968075] [<00000000013d2862>] ip_finish_output2+0xce2/0x11a0 [ 264.968078] [<00000000013d9b14>] ip_finish_output+0x56c/0x8c8 [ 264.968081] [<00000000013ddd1e>] ip_output+0x226/0x4c0 [ 264.968083] [<00000000013dbd6c>] __ip_queue_xmit+0x894/0x1938 [ 264.968100] [<000003ff80bc3a5c>] sctp_packet_transmit+0x29d4/0x3648 [sctp] [ 264.968116] [<000003ff80b7bf68>] sctp_outq_flush_ctrl.constprop.5+0x8d0/0xe50 [sctp] [ 264.968131] [<000003ff80b7c716>] sctp_outq_flush+0x22e/0x7d8 [sctp] [ 264.968146] 
[<000003ff80b35c68>] sctp_cmd_interpreter.isra.16+0x530/0x6800 [sctp] [ 264.968161] [<000003ff80b3410a>] sctp_do_sm+0x222/0x648 [sctp] [ 264.968177] [<000003ff80bbddac>] sctp_primitive_ASSOCIATE+0xbc/0xf8 [sctp] [ 264.968192] [<000003ff80b93328>] __sctp_connect+0x830/0xc20 [sctp] [ 264.968208] [<000003ff80bb11ce>] sctp_inet_connect+0x2e6/0x378 [sctp] [ 264.968212] [<0000000001197942>] __sys_connect+0x21a/0x450 [ 264.968215] [<000000000119aff8>] sys_socketcall+0x3d0/0xb08 [ 264.968218] [<000000000184ea7a>] system_call+0x2a2/0x2c0 [...] Just like ip_finish_output2() does for IPv4, check that we have enough headroom in ip6_xmit(), and reallocate it if we don't. This issue is older than git history. Reported-by: Jianlin Shi Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 827a3f5ff3bb..fcd3c66ded16 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -195,37 +195,37 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); + unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; - if (opt) { - unsigned int head_room; + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + if (opt) + head_room += opt->opt_nflen + opt->opt_flen; - /* First: exthdrs may take lots of space (~8K for now) - MAX_HEADER is not enough. - */ - head_room = opt->opt_nflen + opt->opt_flen; - seg_len += head_room; - head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); - - if (skb_headroom(skb) < head_room) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (!skb2) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); - return -ENOBUFS; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; + if (unlikely(skb_headroom(skb) < head_room)) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (!skb2) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + if (opt) { + seg_len += opt->opt_nflen + opt->opt_flen; + if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, &fl6->saddr); From e6ac64d4c4d095085d7dd71cbd05704ac99829b2 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 6 Dec 2018 19:30:37 +0100 Subject: [PATCH 86/93] neighbour: Avoid writing before skb->head in neigh_hh_output() While skb_push() makes the kernel panic if the skb headroom is less than the unaligned hardware header size, it will proceed normally in case we copy more than that because of alignment, and we'll silently corrupt adjacent slabs. In the case fixed by the previous patch, "ipv6: Check available headroom in ip6_xmit() even without options", we end up in neigh_hh_output() with 14 bytes headroom, 14 bytes hardware header and write 16 bytes, starting 2 bytes before the allocated buffer. Always check we're not writing before skb->head and, if the headroom is not enough, warn and drop the packet. 
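A worked example of the overflow with the numbers from the scenario above (HH_DATA_MOD is 16):

	/*
	 * skb_headroom(skb) = 14   bytes available in front of skb->data
	 * hh_len            = 14   unaligned Ethernet hardware header
	 * copy size         = 16   HH_DATA_MOD-aligned copy in neigh_hh_output()
	 *
	 * memcpy(skb->data - 16, hh->hh_data, 16) therefore starts writing
	 * 2 bytes before skb->head and corrupts the adjacent slab object,
	 * while skb_push(skb, 14) alone would not have triggered the panic.
	 */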
v2: - instead of panicking with BUG_ON(), WARN_ON_ONCE() and drop the packet (Eric Dumazet) - if we avoid the panic, though, we need to explicitly check the headroom before the memcpy(), otherwise we'll have corrupted slabs on a running kernel, after we warn - use __skb_push() instead of skb_push(), as the headroom check is already implemented here explicitly (Eric Dumazet) Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- include/net/neighbour.h | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f58b384aa6c9..665990c7dec8 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -454,6 +454,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { + unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; @@ -461,16 +462,33 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb seq = read_seqbegin(&hh->hh_lock); hh_len = hh->hh_len; if (likely(hh_len <= HH_DATA_MOD)) { - /* this is inlined by gcc */ - memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); - } else { - unsigned int hh_alen = HH_DATA_ALIGN(hh_len); + hh_alen = HH_DATA_MOD; - memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); + /* skb_push() would proceed silently if we have room for + * the unaligned size but not for the aligned size: + * check headroom explicitly. + */ + if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { + /* this is inlined by gcc */ + memcpy(skb->data - HH_DATA_MOD, hh->hh_data, + HH_DATA_MOD); + } + } else { + hh_alen = HH_DATA_ALIGN(hh_len); + + if (likely(skb_headroom(skb) >= hh_alen)) { + memcpy(skb->data - hh_alen, hh->hh_data, + hh_alen); + } } } while (read_seqretry(&hh->hh_lock, seq)); - skb_push(skb, hh_len); + if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + + __skb_push(skb, hh_len); return dev_queue_xmit(skb); } From 5b3279e2cba2238b37f6c18adfdea8bddb32715a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 7 Dec 2018 15:05:04 +1100 Subject: [PATCH 87/93] Revert "net/ibm/emac: wrong bit is used for STA control" This reverts commit 624ca9c33c8a853a4a589836e310d776620f4ab9. This commit is completely bogus. The STACR register has two formats, old and new, depending on the version of the IP block used. There's a pair of device-tree properties that can be used to specify the format used: has-inverted-stacr-oc has-new-stacr-staopc What this commit did was to change the bit definition used with the old parts to match the new parts. This of course breaks the driver on all the old ones. Instead, the author should have set the appropriate properties in the device-tree for the variant used on his board. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/emac/emac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/emac/emac.h b/drivers/net/ethernet/ibm/emac/emac.h index e2f80cca9bed..0d2de6f67676 100644 --- a/drivers/net/ethernet/ibm/emac/emac.h +++ b/drivers/net/ethernet/ibm/emac/emac.h @@ -231,7 +231,7 @@ struct emac_regs { #define EMAC_STACR_PHYE 0x00004000 #define EMAC_STACR_STAC_MASK 0x00003000 #define EMAC_STACR_STAC_READ 0x00001000 -#define EMAC_STACR_STAC_WRITE 0x00000800 +#define EMAC_STACR_STAC_WRITE 0x00002000 #define EMAC_STACR_OPBC_MASK 0x00000C00 #define EMAC_STACR_OPBC_50 0x00000000 #define EMAC_STACR_OPBC_66 0x00000400 From bd5122cd1e0644d8bd8dd84517c932773e999766 Mon Sep 17 00:00:00 2001 From: Tarick Bedeir Date: Fri, 7 Dec 2018 00:30:26 -0800 Subject: [PATCH 88/93] net/mlx4_core: Correctly set PFC param if global pause is turned off. rx_ppp and tx_ppp can be set between 0 and 255, so don't clamp to 1. Fixes: 6e8814ceb7e8 ("net/mlx4_en: Fix mixed PFC and Global pause user control requests") Signed-off-by: Tarick Bedeir Reviewed-by: Eran Ben Elisha Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index f11b45001cad..d290f0787dfb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1084,8 +1084,8 @@ static int mlx4_en_set_pauseparam(struct net_device *dev, tx_pause = !!(pause->tx_pause); rx_pause = !!(pause->rx_pause); - rx_ppp = priv->prof->rx_ppp && !(tx_pause || rx_pause); - tx_ppp = priv->prof->tx_ppp && !(tx_pause || rx_pause); + rx_ppp = (tx_pause || rx_pause) ? 0 : priv->prof->rx_ppp; + tx_ppp = (tx_pause || rx_pause) ? 0 : priv->prof->tx_ppp; err = mlx4_SET_PORT_general(mdev->dev, priv->port, priv->rx_skb_size + ETH_FCS_LEN, From 804fba4e9f508c8004a4bfbdf3f300ca237c56df Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:00:59 -0500 Subject: [PATCH 89/93] bnxt_en: Fix CNP CoS queue regression. Recent changes to support the 57500 devices have created this regression. The bnxt_hwrm_queue_qportcfg() call was moved to be called earlier before the RDMA support was determined, causing the CoS queues configuration to be set before knowing whether RDMA was supported or not. Fix it by moving it to the right place right after RDMA support is determined. Fixes: 98f04cf0f1fc ("bnxt_en: Check context memory requirements from firmware.") Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d4c300117529..0cf4cb93c1e1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6292,6 +6292,8 @@ hwrm_func_qcaps_exit: return rc; } +static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp); + static int bnxt_hwrm_func_qcaps(struct bnxt *bp) { int rc; @@ -6299,6 +6301,11 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp) rc = __bnxt_hwrm_func_qcaps(bp); if (rc) return rc; + rc = bnxt_hwrm_queue_qportcfg(bp); + if (rc) { + netdev_err(bp->dev, "hwrm query qportcfg failure rc: %d\n", rc); + return rc; + } if (bp->hwrm_spec_code >= 0x10803) { rc = bnxt_alloc_ctx_mem(bp); if (rc) From 75720e6323a1d195ae3ebf1a7b5e17c2e687f552 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:01:00 -0500 Subject: [PATCH 90/93] bnxt_en: Keep track of reserved IRQs. The new 57500 chips use 1 NQ per MSIX vector, whereas legacy chips use 1 CP ring per MSIX vector. To better unify this, add a resv_irqs field to struct bnxt_hw_resc. On legacy chips, we initialize resv_irqs with resv_cp_rings. On new chips, we initialize it with the allocated MSIX resources. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0cf4cb93c1e1..c39820b2268f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5162,6 +5162,7 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp) cp = le16_to_cpu(resp->alloc_cmpl_rings); stats = le16_to_cpu(resp->alloc_stat_ctx); cp = min_t(u16, cp, stats); + hw_resc->resv_irqs = cp; if (bp->flags & BNXT_FLAG_CHIP_P5) { int rx = hw_resc->resv_rx_rings; int tx = hw_resc->resv_tx_rings; @@ -5175,7 +5176,7 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp) hw_resc->resv_rx_rings = rx; hw_resc->resv_tx_rings = tx; } - cp = le16_to_cpu(resp->alloc_msix); + hw_resc->resv_irqs = le16_to_cpu(resp->alloc_msix); hw_resc->resv_hw_ring_grps = rx; } hw_resc->resv_cp_rings = cp; @@ -7055,7 +7056,9 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num) int total_req = bp->cp_nr_rings + num; int max_idx, avail_msix; - max_idx = min_t(int, bp->total_irqs, max_cp); + max_idx = bp->total_irqs; + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + max_idx = min_t(int, bp->total_irqs, max_cp); avail_msix = max_idx - bp->cp_nr_rings; if (!BNXT_NEW_RM(bp) || avail_msix >= num) return avail_msix; @@ -7801,6 +7804,7 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) rc = bnxt_hwrm_func_resc_qcaps(bp, true); hw_resc->resv_cp_rings = 0; + hw_resc->resv_irqs = 0; hw_resc->resv_tx_rings = 0; hw_resc->resv_rx_rings = 0; hw_resc->resv_hw_ring_grps = 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 9e99d4ab3e06..3030931ccaf8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -928,6 +928,7 @@ struct bnxt_hw_resc { u16 min_stat_ctxs; u16 max_stat_ctxs; u16 max_irqs; + u16 resv_irqs; }; #if defined(CONFIG_BNXT_SRIOV) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index b59b382d34f9..0a3097baafde 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -168,7 +168,7 @@ static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id, if (BNXT_NEW_RM(bp)) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; - avail_msix = hw_resc->resv_cp_rings - bp->cp_nr_rings; + avail_msix = hw_resc->resv_irqs - bp->cp_nr_rings; edev->ulp_tbl[ulp_id].msix_requested = avail_msix; } bnxt_fill_msix_vecs(bp, ent); From c0b8cda05e1d8151f57a79e525c2c7d51cec2f4e Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:01:01 -0500 Subject: [PATCH 91/93] bnxt_en: Fix NQ/CP rings accounting on the new 57500 chips. The new 57500 chips have introduced the NQ structure in addition to the existing CP rings in all chips. We need to introduce a new bnxt_nq_rings_in_use(). On legacy chips, the 2 functions are the same and one will just call the other. On the new chips, they refer to the 2 separate ring structures. The new function is now called to determine the resource (NQ or CP rings) associated with MSIX that are in use. On 57500 chips, the RDMA driver does not use the CP rings so we don't need to do the subtraction adjustment. Fixes: 41e8d7983752 ("bnxt_en: Modify the ring reservation functions for 57500 series chips.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 29 ++++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c39820b2268f..2e90d98640d1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5354,7 +5354,7 @@ static int bnxt_hwrm_reserve_rings(struct bnxt *bp, int tx, int rx, int grp, return bnxt_hwrm_reserve_vf_rings(bp, tx, rx, grp, cp, vnic); } -static int bnxt_cp_rings_in_use(struct bnxt *bp) +static int bnxt_nq_rings_in_use(struct bnxt *bp) { int cp = bp->cp_nr_rings; int ulp_msix, ulp_base; @@ -5369,10 +5369,22 @@ static int bnxt_cp_rings_in_use(struct bnxt *bp) return cp; } +static int bnxt_cp_rings_in_use(struct bnxt *bp) +{ + int cp; + + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + return bnxt_nq_rings_in_use(bp); + + cp = bp->tx_nr_rings + bp->rx_nr_rings; + return cp; +} + static bool bnxt_need_reserve_rings(struct bnxt *bp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; int cp = bnxt_cp_rings_in_use(bp); + int nq = bnxt_nq_rings_in_use(bp); int rx = bp->rx_nr_rings; int vnic = 1, grp = rx; @@ -5388,7 +5400,7 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp) rx <<= 1; if (BNXT_NEW_RM(bp) && (hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp || - hw_resc->resv_vnics != vnic || + hw_resc->resv_irqs < nq || hw_resc->resv_vnics != vnic || (hw_resc->resv_hw_ring_grps != grp && !(bp->flags & BNXT_FLAG_CHIP_P5)))) return true; @@ -5398,7 +5410,7 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp) static int __bnxt_reserve_rings(struct bnxt *bp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; - int cp = bnxt_cp_rings_in_use(bp); + int cp = bnxt_nq_rings_in_use(bp); int tx = bp->tx_nr_rings; int rx = bp->rx_nr_rings; int grp, rx_rings, rc; @@ -5423,7 +5435,7 @@ static int __bnxt_reserve_rings(struct bnxt *bp) tx = hw_resc->resv_tx_rings; if (BNXT_NEW_RM(bp)) { rx = hw_resc->resv_rx_rings; - cp = hw_resc->resv_cp_rings; + cp = hw_resc->resv_irqs; grp = hw_resc->resv_hw_ring_grps; vnic = hw_resc->resv_vnics; 
} @@ -7034,7 +7046,12 @@ unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp) unsigned int bnxt_get_max_func_cp_rings_for_en(struct bnxt *bp) { - return bp->hw_resc.max_cp_rings - bnxt_get_ulp_msix_num(bp); + unsigned int cp = bp->hw_resc.max_cp_rings; + + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + cp -= bnxt_get_ulp_msix_num(bp); + + return cp; } static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp) @@ -7076,7 +7093,7 @@ static int bnxt_get_num_msix(struct bnxt *bp) if (!BNXT_NEW_RM(bp)) return bnxt_get_max_func_irqs(bp); - return bnxt_cp_rings_in_use(bp); + return bnxt_nq_rings_in_use(bp); } static int bnxt_init_msix(struct bnxt *bp) From e30fbc33190b8ba1d6e8ff4864627f7414b5ca99 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:01:02 -0500 Subject: [PATCH 92/93] bnxt_en: Fix _bnxt_get_max_rings() for 57500 chips. The CP rings are accounted differently on the new 57500 chips. There must be enough CP rings for the sum of RX and TX rings on the new chips. The current logic may be over-estimating the RX and TX rings. The output parameter max_cp should be the maximum NQs capped by MSIX vectors available for networking in the context of 57500 chips. The existing code which uses CMPL rings capped by the MSIX vectors works most of the time but is not always correct. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2e90d98640d1..5d21c14853ac 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9827,13 +9827,16 @@ static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, int *max_cp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; - int max_ring_grps = 0; + int max_ring_grps = 0, max_irq; *max_tx = hw_resc->max_tx_rings; *max_rx = hw_resc->max_rx_rings; - *max_cp = min_t(int, bnxt_get_max_func_cp_rings_for_en(bp), - hw_resc->max_irqs - bnxt_get_ulp_msix_num(bp)); - *max_cp = min_t(int, *max_cp, hw_resc->max_stat_ctxs); + *max_cp = bnxt_get_max_func_cp_rings_for_en(bp); + max_irq = min_t(int, bnxt_get_max_func_irqs(bp) - + bnxt_get_ulp_msix_num(bp), + bnxt_get_max_func_stat_ctxs(bp)); + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + *max_cp = min_t(int, *max_cp, max_irq); max_ring_grps = hw_resc->max_hw_ring_grps; if (BNXT_CHIP_TYPE_NITRO_A0(bp) && BNXT_PF(bp)) { *max_cp -= 1; @@ -9841,6 +9844,11 @@ static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, } if (bp->flags & BNXT_FLAG_AGG_RINGS) *max_rx >>= 1; + if (bp->flags & BNXT_FLAG_CHIP_P5) { + bnxt_trim_rings(bp, max_rx, max_tx, *max_cp, false); + /* On P5 chips, max_cp output param should be available NQs */ + *max_cp = max_irq; + } *max_rx = min_t(int, *max_rx, max_ring_grps); } From 35cc3cefc4de90001c9137e2d01dd9d06b11acfb Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 9 Dec 2018 18:10:24 +0200 Subject: [PATCH 93/93] net/sched: cls_flower: Reject duplicated rules also under skip_sw Currently, duplicated rules are rejected only for skip_hw or "none", hence allowing users to push duplicates into HW for no reason. Use the flower tables to protect for that. Signed-off-by: Or Gerlitz Signed-off-by: Paul Blakey Reported-by: Chris Mi Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c6c327874abc..71312d7bd8f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1238,18 +1238,16 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout_idr; - if (!tc_skip_sw(fnew->flags)) { - if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { - err = -EEXIST; - goto errout_mask; - } - - err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, - fnew->mask->filter_ht_params); - if (err) - goto errout_mask; + if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { + err = -EEXIST; + goto errout_mask; } + err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, + fnew->mask->filter_ht_params); + if (err) + goto errout_mask; + if (!tc_skip_hw(fnew->flags)) { err = fl_hw_replace_filter(tp, fnew, extack); if (err) @@ -1303,9 +1301,8 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; - if (!tc_skip_sw(f->flags)) - rhashtable_remove_fast(&f->mask->ht, &f->ht_node, - f->mask->filter_ht_params); + rhashtable_remove_fast(&f->mask->ht, &f->ht_node, + f->mask->filter_ht_params); __fl_delete(tp, f, extack); *last = list_empty(&head->masks); return 0;