From 05cc09de4c017663a217630682041066f2f9a5cd Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 5 Oct 2018 23:22:06 +0300 Subject: [PATCH 01/93] mac80211_hwsim: fix module init error paths for netlink There is no unregister netlink notifier and family on error paths in init_mac80211_hwsim(). Also there is an error path where hwsim_class is not destroyed. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Fixes: 62759361eb49 ("mac80211-hwsim: Provide multicast event for HWSIM_CMD_NEW_RADIO") Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index aa8058264d5b..07f958c63334 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3703,16 +3703,16 @@ static int __init init_mac80211_hwsim(void) if (err) goto out_unregister_pernet; + err = hwsim_init_netlink(); + if (err) + goto out_unregister_driver; + hwsim_class = class_create(THIS_MODULE, "mac80211_hwsim"); if (IS_ERR(hwsim_class)) { err = PTR_ERR(hwsim_class); - goto out_unregister_driver; + goto out_exit_netlink; } - err = hwsim_init_netlink(); - if (err < 0) - goto out_unregister_driver; - for (i = 0; i < radios; i++) { struct hwsim_new_radio_params param = { 0 }; @@ -3818,6 +3818,8 @@ out_free_mon: free_netdev(hwsim_mon); out_free_radios: mac80211_hwsim_free(); +out_exit_netlink: + hwsim_exit_netlink(); out_unregister_driver: platform_driver_unregister(&mac80211_hwsim_driver); out_unregister_pernet: From 33483a6b88e4c4c3fc50178b185da52c55288b95 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 16 Oct 2018 02:35:30 +0000 Subject: [PATCH 02/93] mac80211: fix missing unlock on error in ieee80211_mark_sta_auth() Add the missing unlock before return from function ieee80211_mark_sta_auth() in the error handling case. 
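The pattern of the fix, as a minimal standalone sketch (plain C with pthreads, not the actual mac80211 code; names are illustrative): error paths taken while the lock is held jump to a single unlock label instead of returning directly, so the mutex is released exactly once on every path.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t sta_lock = PTHREAD_MUTEX_INITIALIZER;
static int sta_present;                 /* stand-in for the protected state */

static bool mark_sta_auth(void)
{
	bool result = true;

	pthread_mutex_lock(&sta_lock);

	if (!sta_present) {
		result = false;
		goto out;               /* was: "return false" with the lock still held */
	}

	sta_present = 2;                /* stand-in for moving the STA to AUTH state */
out:
	pthread_mutex_unlock(&sta_lock);
	return result;
}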
Cc: stable@vger.kernel.org Fixes: fc107a933071 ("mac80211: Helper function for marking STA authenticated") Signed-off-by: Wei Yongjun [use result variable/label instead of duplicating] Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d2bc8d57c87e..bcf5ffc1567a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2766,6 +2766,7 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sta_info *sta; + bool result = true; sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; @@ -2778,15 +2779,18 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, sta = sta_info_get(sdata, bssid); if (!sta) { WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid); - return false; + result = false; + goto out; } if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { sdata_info(sdata, "failed moving %pM to auth\n", bssid); - return false; + result = false; + goto out; } - mutex_unlock(&sdata->local->sta_mtx); - return true; +out: + mutex_unlock(&sdata->local->sta_mtx); + return result; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, From a1881c9b8a1edef0a5ae1d5c1b61406fe3402114 Mon Sep 17 00:00:00 2001 From: Vasyl Vavrychuk Date: Thu, 18 Oct 2018 01:02:12 +0300 Subject: [PATCH 03/93] mac80211_hwsim: Timer should be initialized before device registered Otherwise if network manager starts configuring Wi-Fi interface immidiatelly after getting notification of its creation, we will get NULL pointer dereference: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] hrtimer_active+0x28/0x50 ... Call Trace: [] ? hrtimer_try_to_cancel+0x27/0x110 [] ? hrtimer_cancel+0x15/0x20 [] ? mac80211_hwsim_config+0x140/0x1c0 [mac80211_hwsim] Cc: stable@vger.kernel.org Signed-off-by: Vasyl Vavrychuk Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 07f958c63334..d1464e3e1be2 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2884,6 +2884,10 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); + tasklet_hrtimer_init(&data->beacon_timer, + mac80211_hwsim_beacon, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + err = ieee80211_register_hw(hw); if (err < 0) { pr_debug("mac80211_hwsim: ieee80211_register_hw failed (%d)\n", @@ -2908,10 +2912,6 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, data->debugfs, data, &hwsim_simulate_radar); - tasklet_hrtimer_init(&data->beacon_timer, - mac80211_hwsim_beacon, - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - spin_lock_bh(&hwsim_radio_lock); err = rhashtable_insert_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); From 81c5dce2cd0bb0ecb61b6212410da5eb78cd8f79 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 19 Oct 2018 15:40:13 +0000 Subject: [PATCH 04/93] cfg80211: add missing constraint for user-supplied VHT mask Do a logical vht_capa &= vht_capa_mask of user-supplied VHT mask with the driver-supplied mask of modifiable VHT capabilities. Fix whitespaces and comment typos. 
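What the added constraint does, as a minimal sketch (simplified stand-in types, not the real cfg80211 structures): the user-supplied VHT override mask is ANDed byte-wise with the driver's mask of modifiable capability bits, so userspace can only affect bits the driver has declared changeable.

#include <stdint.h>
#include <string.h>

struct vht_cap { uint8_t raw[12]; };    /* simplified stand-in for the VHT capabilities */

static void oper_and_vht_capa(struct vht_cap *vht_capa,
                              const struct vht_cap *vht_capa_mask)
{
	uint8_t *p1 = (uint8_t *)vht_capa;
	const uint8_t *p2 = (const uint8_t *)vht_capa_mask;
	size_t i;

	if (!vht_capa_mask) {
		/* no driver mask: nothing is modifiable */
		memset(vht_capa, 0, sizeof(*vht_capa));
		return;
	}

	for (i = 0; i < sizeof(*vht_capa); i++)
		p1[i] &= p2[i];          /* keep only driver-modifiable bits */
}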
Signed-off-by: Sergey Matyukevich Signed-off-by: Johannes Berg --- net/wireless/mlme.c | 4 ++-- net/wireless/sme.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 12b3edf70a7b..1615e503f8e3 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -272,11 +272,11 @@ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); - for (i = 0; iht_capa_mask, rdev->wiphy.ht_capa_mod_mask); + cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, + rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; From c752cac9db1b0c469db7ba9d17af4ba708984db5 Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Tue, 23 Oct 2018 11:24:44 +0800 Subject: [PATCH 05/93] mac80211: fix GFP_KERNEL under tasklet context cfg80211_sta_opmode_change_notify needs a gfp_t flag to hint the nl80211 stack when allocating new skb, but it is called under tasklet context here with GFP_KERNEL and kernel will yield a warning about it. Cc: stable@vger.kernel.org Fixes: ff84e7bfe176 ("mac80211: Add support to notify ht/vht opmode modification.") Signed-off-by: Yan-Hsuan Chuang ACKed-by: Larry Finger Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3bd3b5769797..a69ecfb212ed 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3063,7 +3063,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { @@ -3100,7 +3100,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } default: From 5c21e8100dfd57c806e833ae905e26efbb87840f Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Tue, 23 Oct 2018 13:36:52 -0700 Subject: [PATCH 06/93] mac80211: Clear beacon_int in ieee80211_do_stop This fixes stale beacon-int values that would keep a netdev from going up. To reproduce: Create two VAP on one radio. vap1 has beacon-int 100, start it. vap2 has beacon-int 240, start it (and it will fail because beacon-int mismatch). reconfigure vap2 to have beacon-int 100 and start it. It will fail because the stale beacon-int 240 will be used in the ifup path and hostapd never gets a chance to set the new beacon interval. Cc: stable@vger.kernel.org Signed-off-by: Ben Greear Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5836ddeac9e3..5f3c81e705c7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1015,6 +1015,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (local->open_count == 0) ieee80211_clear_tx_pending(local); + sdata->vif.bss_conf.beacon_int = 0; + /* * If the interface goes down while suspended, presumably because * the device was unplugged and that happens before our resume, From c177db2d0d5e751d52d3827b8cfdb6ef92a95a2d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 30 Oct 2018 09:17:44 +0100 Subject: [PATCH 07/93] cfg80211/mac80211: fix FTM settings across CSA When FTM is enabled, doing a CSA will unexpectedly lose it since the value of ftm_responder may be initialized to 0 instead of -1, so fix that. 
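The convention the fix restores, as a small sketch (simplified struct, not the real cfg80211 one): ftm_responder is a tri-state where -1 means "leave the current FTM responder setting alone"; zero-initializing it instead makes every channel switch silently disable FTM.

struct csa_beacon_params {
	int ftm_responder;              /* -1 = no change, 0 = disable, 1 = enable */
};

static void apply_csa_ftm(const struct csa_beacon_params *params,
                          int *ftm_enabled)
{
	if (params->ftm_responder >= 0)
		*ftm_enabled = params->ftm_responder;
	/* -1: keep whatever was configured before the channel switch */
}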
Fixes: 81e54d08d9d8 ("cfg80211: support FTM responder configuration/statistics") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 744b5851bbf9..8d763725498c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7870,6 +7870,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) } memset(¶ms, 0, sizeof(params)); + params.beacon_csa.ftm_responder = -1; if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]) From 03b738625b1e58f4ae2bddf04706ab85c677af2d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 30 Oct 2018 09:17:45 +0100 Subject: [PATCH 08/93] mac80211: fix CSA beacon allocation size If the FTM responder settings are changed simultaneously with the CSA beacon, the buffer size allocated isn't sufficient and we'll have a heap overrun. Fix this. While at it, also clean up the ftm_responder assignment, doing it only if ftm_responder is non-zero is valid as it's 0 to start with, but not really useful to understand the code. Fixes: bc847970f432 ("mac80211: support FTM responder configuration/statistics") Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 51622333d460..818aa0060349 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2891,7 +2891,7 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + - beacon->probe_resp_len; + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) @@ -2934,8 +2934,9 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } - if (beacon->ftm_responder) - new_beacon->ftm_responder = beacon->ftm_responder; + + /* might copy -1, meaning no changes requested */ + new_beacon->ftm_responder = beacon->ftm_responder; if (beacon->lci) { new_beacon->lci_len = beacon->lci_len; new_beacon->lci = pos; From 113f3aaa81bd56aba02659786ed65cbd9cb9a6fc Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 19 Oct 2018 14:42:59 +0530 Subject: [PATCH 09/93] cfg80211: Prevent regulatory restore during STA disconnect in concurrent interfaces Currently when an AP and STA interfaces are active in the same or different radios, regulatory settings are restored whenever the STA disconnects. This restores all channel information including dfs states in all radios. For example, if an AP interface is active in one radio and STA in another, when radar is detected on the AP interface, the dfs state of the channel will be changed to UNAVAILABLE. But when the STA interface disconnects, this issues a regulatory disconnect hint which restores all regulatory settings in all the radios attached and thereby losing the stored dfs state on the other radio where the channel was marked as unavailable earlier. Hence prevent such regulatory restore whenever another active beaconing interface is present in the same or other radios. 
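The shape of the check, as a minimal sketch (simplified data structures, not the real cfg80211 ones): the disconnect hint, which would wipe learned channel and DFS state on every radio, is only issued when no interface is connected, connecting, or actively beaconing.

#include <stdbool.h>
#include <stddef.h>

struct wdev {
	bool connected;
	bool connecting;
	bool beaconing;                 /* e.g. an AP or mesh interface that is up */
	struct wdev *next;
};

static bool all_interfaces_idle(const struct wdev *list)
{
	const struct wdev *w;

	for (w = list; w; w = w->next)
		if (w->connected || w->connecting || w->beaconing)
			return false;

	return true;
}

/* caller: issue the regulatory disconnect hint only if all_interfaces_idle() */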
Signed-off-by: Sriram R Signed-off-by: Johannes Berg --- net/wireless/sme.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 1c93412038dc..f741d8376a46 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -642,11 +642,15 @@ static bool cfg80211_is_all_idle(void) * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. + * Also if there is any other active beaconing interface we + * need not issue a disconnect hint and reset any info such + * as chan dfs state, etc. */ list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); - if (wdev->conn || wdev->current_bss) + if (wdev->conn || wdev->current_bss || + cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; wdev_unlock(wdev); } From cdbb096adddb3f42584cecb5ec2e07c26815b71f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:27 -0800 Subject: [PATCH 10/93] bpf: btf: implement btf_name_valid_identifier() Function btf_name_valid_identifier() have been implemented in bpf-next commit 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO"). Backport this function so later patch can use it. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee4c82667d65..93c233ab2db6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -426,6 +427,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + /* offset must be valid */ + const char *src = &btf->strings[offset]; + const char *src_limit; + + if (!isalpha(*src) && *src != '_') + return false; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isalnum(*src) && *src != '_') + return false; + src++; + } + + return !*src; +} + static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) From eb04bbb608e683f8fd3ef7f716e2fa32dd90861f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:28 -0800 Subject: [PATCH 11/93] bpf: btf: check name validity for various types This patch added name checking for the following types: . BTF_KIND_PTR, BTF_KIND_ARRAY, BTF_KIND_VOLATILE, BTF_KIND_CONST, BTF_KIND_RESTRICT: the name must be null . BTF_KIND_STRUCT, BTF_KIND_UNION: the struct/member name is either null or a valid identifier . BTF_KIND_ENUM: the enum type name is either null or a valid identifier; the enumerator name must be a valid identifier. . BTF_KIND_FWD: the name must be a valid identifier . BTF_KIND_TYPEDEF: the name must be a valid identifier For those places a valid name is required, the name must be a valid C identifier. This can be relaxed later if we found use cases for a different (non-C) frontend. 
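A standalone version of the identifier rule described above (plain C re-implementation for illustration; the length limit is a stand-in for KSYM_NAME_LEN): a name is accepted only if it looks like a C identifier, i.e. a leading letter or underscore followed by letters, digits or underscores.

#include <ctype.h>
#include <stdbool.h>

#define MAX_IDENT_LEN 128               /* stand-in for KSYM_NAME_LEN */

static bool name_valid_identifier(const char *src)
{
	const char *src_limit = src + MAX_IDENT_LEN;

	if (!isalpha((unsigned char)*src) && *src != '_')
		return false;

	src++;
	while (*src && src < src_limit) {
		if (!isalnum((unsigned char)*src) && *src != '_')
			return false;
		src++;
	}

	return !*src;                   /* also rejects names longer than the limit */
}

/* e.g. "const_void_ptr" and "_tmp3" pass; "", "3abc" and "A!" are rejected */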
Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 93c233ab2db6..4da543d6bea2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1168,6 +1168,22 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* typedef type must have a valid name, and other ref types, + * volatile, const, restrict, should have a null name. + */ + if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } else { + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1325,6 +1341,13 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* fwd type must have a valid name */ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1381,6 +1404,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* array type should not have a name */ + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; @@ -1557,6 +1586,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); last_offset = 0; @@ -1568,6 +1604,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct member either no name or a valid one */ + if (member->name_off && + !btf_name_valid_identifier(btf, member->name_off)) { + btf_verifier_log_member(env, t, member, "Invalid name"); + return -EINVAL; + } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, @@ -1755,6 +1797,13 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { @@ -1764,6 +1813,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log(env, "\t%s val=%d\n", btf_name_by_offset(btf, enums[i].name_off), enums[i].val); From 8800cd031af085807028656c6ba7eb7908d78262 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 27 Nov 2018 13:23:29 -0800 Subject: [PATCH 12/93] tools/bpf: fix two test_btf unit test cases There are two unit test cases, which should encode TYPEDEF type, but instead encode PTR type. 
The error is flagged out after enforcing name checking in the previous patch. Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests") Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_btf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index f42b3396d622..b361bb851829 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -432,11 +432,11 @@ static struct btf_raw_test raw_tests[] = { /* const void* */ /* [3] */ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* typedef const void * const_void_ptr */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3), - /* struct A { */ /* [4] */ + BTF_TYPEDEF_ENC(NAME_TBD, 3), /* [4] */ + /* struct A { */ /* [5] */ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)), /* const_void_ptr m; */ - BTF_MEMBER_ENC(NAME_TBD, 3, 0), + BTF_MEMBER_ENC(NAME_TBD, 4, 0), /* } */ BTF_END_RAW, }, @@ -494,10 +494,10 @@ static struct btf_raw_test raw_tests[] = { BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), /* const void* */ /* [3] */ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), - /* typedef const void * const_void_ptr */ /* [4] */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3), - /* const_void_ptr[4] */ /* [5] */ - BTF_TYPE_ARRAY_ENC(3, 1, 4), + /* typedef const void * const_void_ptr */ + BTF_TYPEDEF_ENC(NAME_TBD, 3), /* [4] */ + /* const_void_ptr[4] */ + BTF_TYPE_ARRAY_ENC(4, 1, 4), /* [5] */ BTF_END_RAW, }, .str_sec = "\0const_void_ptr", From d08489125e04a9f73d9323caea43270fd22d395f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:30 -0800 Subject: [PATCH 13/93] tools/bpf: add addition type tests to test_btf The following additional unit testcases are added to test_btf: ... BTF raw test[42] (typedef (invalid name, name_off = 0)): OK BTF raw test[43] (typedef (invalid name, invalid identifier)): OK BTF raw test[44] (ptr type (invalid name, name_off <> 0)): OK BTF raw test[45] (volatile type (invalid name, name_off <> 0)): OK BTF raw test[46] (const type (invalid name, name_off <> 0)): OK BTF raw test[47] (restrict type (invalid name, name_off <> 0)): OK BTF raw test[48] (fwd type (invalid name, name_off = 0)): OK BTF raw test[49] (fwd type (invalid name, invalid identifier)): OK BTF raw test[50] (array type (invalid name, name_off <> 0)): OK BTF raw test[51] (struct type (name_off = 0)): OK BTF raw test[52] (struct type (invalid name, invalid identifier)): OK BTF raw test[53] (struct member (name_off = 0)): OK BTF raw test[54] (struct member (invalid name, invalid identifier)): OK BTF raw test[55] (enum type (name_off = 0)): OK BTF raw test[56] (enum type (invalid name, invalid identifier)): OK BTF raw test[57] (enum member (invalid name, name_off = 0)): OK BTF raw test[58] (enum member (invalid name, invalid identifier)): OK ... 
Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_btf.c | 361 +++++++++++++++++++++++++ 1 file changed, 361 insertions(+) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index b361bb851829..38e1cbaaffdb 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -1292,6 +1292,367 @@ static struct btf_raw_test raw_tests[] = { .err_str = "type != 0", }, +{ + .descr = "typedef (invalid name, name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(0, 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "typedef_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "typedef (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(NAME_TBD, 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__!int", + .str_sec_size = sizeof("\0__!int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "typedef_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "ptr type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "ptr_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "volatile type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "volatile_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "const type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "const_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "restrict type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 1), /* [2] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_RESTRICT, 0, 0), 2), /* [3] */ + BTF_END_RAW, + }, + .str_sec = "\0__int", + .str_sec_size = sizeof("\0__int"), + 
.map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "restrict_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "fwd type (invalid name, name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 0), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__skb", + .str_sec_size = sizeof("\0__skb"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "fwd_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "fwd type (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 0), /* [2] */ + BTF_END_RAW, + }, + .str_sec = "\0__!skb", + .str_sec_size = sizeof("\0__!skb"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "fwd_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "array type (invalid name, name_off <> 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), /* [2] */ + BTF_ARRAY_ENC(1, 1, 4), + BTF_END_RAW, + }, + .str_sec = "\0__skb", + .str_sec_size = sizeof("\0__skb"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "struct type (name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A", + .str_sec_size = sizeof("\0A"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "struct type (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A!\0B", + .str_sec_size = sizeof("\0A!\0B"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "struct member (name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A", + .str_sec_size = sizeof("\0A"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "struct member (invalid name, invalid identifier)", + .raw_types = { + 
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + .str_sec = "\0A\0B*", + .str_sec_size = sizeof("\0A\0B*"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "enum type (name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_END_RAW, + }, + .str_sec = "\0A\0B", + .str_sec_size = sizeof("\0A\0B"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "enum type (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_END_RAW, + }, + .str_sec = "\0A!\0B", + .str_sec_size = sizeof("\0A!\0B"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "enum member (invalid name, name_off = 0)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(0, 0), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, + +{ + .descr = "enum member (invalid name, invalid identifier)", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(0, + BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), + sizeof(int)), /* [2] */ + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_END_RAW, + }, + .str_sec = "\0A!", + .str_sec_size = sizeof("\0A!"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "enum_type_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid name", +}, { .descr = "arraymap invalid btf key (a bit field)", .raw_types = { From 528bff0cdb6649f97f2c4802e4ac7a4b50645f2f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 28 Nov 2018 09:38:23 -0800 Subject: [PATCH 14/93] tools: bpftool: fix a bitfield pretty print issue Commit b12d6ec09730 ("bpf: btf: add btf print functionality") added btf pretty print functionality to bpftool. There is a problem though in printing a bitfield whose type has modifiers. For example, for a type like typedef int ___int; struct tmp_t { int a:3; ___int b:3; }; Suppose we have a map struct bpf_map_def SEC("maps") tmpmap = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(__u32), .value_size = sizeof(struct tmp_t), .max_entries = 1, }; and the hash table is populated with one element with key 0 and value (.a = 1 and .b = 2). 
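A worked example of the wrong output (assuming the usual little-endian bitfield layout, so "a" occupies bits 0-2 and "b" bits 3-5 of the first byte): with .a = 1 and .b = 2 that byte is 0x11, and dropping the bit offset when following the typedef makes the dumper read bits 0-2 twice, printing 0x1 for "b" instead of 0x2.

#include <stdio.h>

static unsigned int read_bitfield(unsigned int byte, unsigned int bit_offset,
                                  unsigned int nr_bits)
{
	return (byte >> bit_offset) & ((1u << nr_bits) - 1);
}

int main(void)
{
	unsigned int byte = 1 | (2 << 3);               /* .a = 1, .b = 2  ->  0x11 */

	printf("a = %#x\n", read_bitfield(byte, 0, 3)); /* 0x1 */
	printf("b = %#x\n", read_bitfield(byte, 3, 3)); /* 0x2, correct */
	printf("b = %#x\n", read_bitfield(byte, 0, 3)); /* 0x1, what happens when the offset is lost */
	return 0;
}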
In BTF, the struct member "b" will have a type "typedef" which points to an int type. The current implementation does not pass the bit offset during transition from typedef to int type, hence incorrectly print the value as $ bpftool m d id 79 [{ "key": 0, "value": { "a": 0x1, "b": 0x1 } } ] This patch fixed the issue by carrying bit_offset along the type chain during bit_field print. The correct result can be printed as $ bpftool m d id 76 [{ "key": 0, "value": { "a": 0x1, "b": 0x2 } } ] The kernel pretty print is implemented correctly and does not have this issue. Fixes: b12d6ec09730 ("bpf: btf: add btf print functionality") Signed-off-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf_dumper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 55bc512a1831..e4e6e2b3fd84 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -32,7 +32,7 @@ static void btf_dumper_ptr(const void *data, json_writer_t *jw, } static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id, - const void *data) + __u8 bit_offset, const void *data) { int actual_type_id; @@ -40,7 +40,7 @@ static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id, if (actual_type_id < 0) return actual_type_id; - return btf_dumper_do_type(d, actual_type_id, 0, data); + return btf_dumper_do_type(d, actual_type_id, bit_offset, data); } static void btf_dumper_enum(const void *data, json_writer_t *jw) @@ -237,7 +237,7 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: - return btf_dumper_modifier(d, type_id, data); + return btf_dumper_modifier(d, type_id, bit_offset, data); default: jsonw_printf(d->jw, "(unsupported-kind"); return -EINVAL; From 09ee3b4a249dd5c64da7d25a52a4ce42a49d647a Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 29 Nov 2018 17:08:36 +0900 Subject: [PATCH 15/93] net: ethernet: ave: Increase descriptors to improve performance To improve performance, this increases Rx descriptor to 256, Tx descriptor to 64, and adjusts NAPI weight to NAPI_POLL_WEIGHT. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/socionext/sni_ave.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 6732f5cbde08..29b5b12bce6c 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -185,8 +185,8 @@ NETIF_MSG_TX_ERR) /* Parameter for descriptor */ -#define AVE_NR_TXDESC 32 /* Tx descriptor */ -#define AVE_NR_RXDESC 64 /* Rx descriptor */ +#define AVE_NR_TXDESC 64 /* Tx descriptor */ +#define AVE_NR_RXDESC 256 /* Rx descriptor */ #define AVE_DESC_OFS_CMDSTS 0 #define AVE_DESC_OFS_ADDRL 4 @@ -1689,9 +1689,10 @@ static int ave_probe(struct platform_device *pdev) pdev->name, pdev->id); /* Register as a NAPI supported driver */ - netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, priv->rx.ndesc); + netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, + NAPI_POLL_WEIGHT); netif_tx_napi_add(ndev, &priv->napi_tx, ave_napi_poll_tx, - priv->tx.ndesc); + NAPI_POLL_WEIGHT); platform_set_drvdata(pdev, ndev); From 88113957ddb7b7d5451e28cd708c82ea7e63b097 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 29 Nov 2018 17:08:37 +0900 Subject: [PATCH 16/93] net: ethernet: ave: Replace NET_IP_ALIGN with AVE_FRAME_HEADROOM In commit 26a4676faa1a ("arm64: mm: define NET_IP_ALIGN to 0"), AVE controller affects this modification because the controller forces to ignore lower 2bits of buffer start address, and make 2-byte headroom, that is, data reception starts from (buffer + 2). This patch defines AVE_FRAME_HEADROOM macro as hardware-specific value, and replaces NET_IP_ALIGN with it. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/sni_ave.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 29b5b12bce6c..0da11344d035 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -194,6 +194,7 @@ /* Parameter for ethernet frame */ #define AVE_MAX_ETHFRAME 1518 +#define AVE_FRAME_HEADROOM 2 /* Parameter for interrupt */ #define AVE_INTM_COUNT 20 @@ -576,12 +577,13 @@ static int ave_rxdesc_prepare(struct net_device *ndev, int entry) skb = priv->rx.desc[entry].skbs; if (!skb) { - skb = netdev_alloc_skb_ip_align(ndev, - AVE_MAX_ETHFRAME); + skb = netdev_alloc_skb(ndev, AVE_MAX_ETHFRAME); if (!skb) { netdev_err(ndev, "can't allocate skb for Rx\n"); return -ENOMEM; } + skb->data += AVE_FRAME_HEADROOM; + skb->tail += AVE_FRAME_HEADROOM; } /* set disable to cmdsts */ @@ -594,12 +596,12 @@ static int ave_rxdesc_prepare(struct net_device *ndev, int entry) * - Rx buffer begins with 2 byte headroom, and data will be put from * (buffer + 2). * To satisfy this, specify the address to put back the buffer - * pointer advanced by NET_IP_ALIGN by netdev_alloc_skb_ip_align(), - * and expand the map size by NET_IP_ALIGN. + * pointer advanced by AVE_FRAME_HEADROOM, and expand the map size + * by AVE_FRAME_HEADROOM. 
*/ ret = ave_dma_map(ndev, &priv->rx.desc[entry], - skb->data - NET_IP_ALIGN, - AVE_MAX_ETHFRAME + NET_IP_ALIGN, + skb->data - AVE_FRAME_HEADROOM, + AVE_MAX_ETHFRAME + AVE_FRAME_HEADROOM, DMA_FROM_DEVICE, &paddr); if (ret) { netdev_err(ndev, "can't map skb for Rx\n"); From d75d0e874ffe929dec143d331b53e4bfceb10af2 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 29 Nov 2018 17:08:38 +0900 Subject: [PATCH 17/93] net: ethernet: ave: Add MODULE_AUTHOR and MAINTAINERS entry Add missing MODULE_AUTHOR of ave driver and an entry to MAINTAINERS. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- MAINTAINERS | 7 +++++++ drivers/net/ethernet/socionext/sni_ave.c | 1 + 2 files changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index da57abebaab3..51d35f0ab989 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13885,6 +13885,13 @@ F: drivers/md/raid* F: include/linux/raid/ F: include/uapi/linux/raid/ +SOCIONEXT (SNI) AVE NETWORK DRIVER +M: Kunihiko Hayashi +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/socionext/sni_ave.c +F: Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt + SOCIONEXT (SNI) NETSEC NETWORK DRIVER M: Jassi Brar L: netdev@vger.kernel.org diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 0da11344d035..7c7cd9d94bcc 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1916,5 +1916,6 @@ static struct platform_driver ave_driver = { }; module_platform_driver(ave_driver); +MODULE_AUTHOR("Kunihiko Hayashi "); MODULE_DESCRIPTION("Socionext UniPhier AVE ethernet driver"); MODULE_LICENSE("GPL v2"); From d7f7e0018b96fd1a30a968faa9464eb57372c1ec Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 29 Nov 2018 12:40:11 +0200 Subject: [PATCH 18/93] net: phy: sfp: correct store of detected link modes The link modes that sfp_parse_support() detects are stored in the 'modes' bitmap. There is no reason to make an exception for 1000Base-PX or 1000Base-BX10. Fixes: 03145864bd0f ("sfp: support 1G BiDi (eg, FiberStore SFP-GE-BX) modules") Signed-off-by: Baruch Siach Acked-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/sfp-bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 83060fb349f4..ad9db652874d 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -162,7 +162,7 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, /* 1000Base-PX or 1000Base-BX10 */ if ((id->base.e_base_px || id->base.e_base_bx10) && br_min <= 1300 && br_max >= 1200) - phylink_set(support, 1000baseX_Full); + phylink_set(modes, 1000baseX_Full); /* For active or passive cables, select the link modes * based on the bit rates and the cable compliance bytes. From f28c020fb488e1a8b87469812017044bef88aa2b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 29 Nov 2018 14:14:48 +0100 Subject: [PATCH 19/93] net: restore call to netdev_queue_numa_node_write when resetting XPS Before commit 80d19669ecd3 ("net: Refactor XPS for CPUs and Rx queues"), netif_reset_xps_queues() did netdev_queue_numa_node_write() for all the queues being reset. Now, this is only done when the "active" variable in clean_xps_maps() is false, ie when on all the CPUs, there's no active XPS mapping left. Fixes: 80d19669ecd3 ("net: Refactor XPS for CPUs and Rx queues") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- net/core/dev.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index ddc551f24ba2..32a63f4c3a92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2187,18 +2187,20 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) { - if (is_rxqs_map) { + if (is_rxqs_map) RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { + else RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); - } kfree_rcu(dev_maps, rcu); } + + if (!is_rxqs_map) { + for (i = offset + (count - 1); count--; i--) { + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + } + } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, From 867d0ad476db89a1e8af3f297af402399a54eea5 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 29 Nov 2018 14:14:49 +0100 Subject: [PATCH 20/93] net: fix XPS static_key accounting Commit 04157469b7b8 ("net: Use static_key for XPS maps") introduced a static key for XPS, but the increments/decrements don't match. First, the static key's counter is incremented once for each queue, but only decremented once for a whole batch of queues, leading to large unbalances. Second, the xps_rxqs_needed key is decremented whenever we reset a batch of queues, whether they had any rxqs mapping or not, so that if we setup cpu-XPS on em1 and RXQS-XPS on em2, resetting the queues on em1 would decrement the xps_rxqs_needed key. This reworks the accounting scheme so that the xps_needed key is incremented only once for each type of XPS for all the queues on a device, and the xps_rxqs_needed key is incremented only once for all queues. This is sufficient to let us retrieve queues via get_xps_queue(). This patch introduces a new reset_xps_maps(), which reinitializes and frees the appropriate map (xps_rxqs_map or xps_cpus_map), and drops a reference to the needed keys: - both xps_needed and xps_rxqs_needed, in case of rxqs maps, - only xps_needed, in case of CPU maps. Now, we also need to call reset_xps_maps() at the end of __netif_set_xps_queue() when there's no active map left, for example when writing '00000000,00000000' to all queues' xps_rxqs setting. Fixes: 04157469b7b8 ("net: Use static_key for XPS maps") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- net/core/dev.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 32a63f4c3a92..3470e7fff1f4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2175,6 +2175,20 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + bool is_rxqs_map) +{ + if (is_rxqs_map) { + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + } + static_key_slow_dec_cpuslocked(&xps_needed); + kfree_rcu(dev_maps, rcu); +} + static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, struct xps_dev_maps *dev_maps, unsigned int nr_ids, u16 offset, u16 count, bool is_rxqs_map) @@ -2186,13 +2200,8 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); - if (!active) { - if (is_rxqs_map) - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - else - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); if (!is_rxqs_map) { for (i = offset + (count - 1); count--; i--) { @@ -2236,10 +2245,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, false); out_no_maps: - if (static_key_enabled(&xps_rxqs_needed)) - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - - static_key_slow_dec_cpuslocked(&xps_needed); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2357,9 +2362,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; - static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) - static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { @@ -2457,13 +2465,8 @@ out_no_new_maps: } /* free map if not active */ - if (!active) { - if (is_rxqs_map) - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - else - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); out_no_maps: mutex_unlock(&xps_map_mutex); From 90230968f102acbe103fbf7c03d41addfef5f153 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 29 Nov 2018 12:00:05 +0200 Subject: [PATCH 21/93] net: phy: sfp: correct location of SFP standards SFP standards are now available from the SNIA (Storage Networking Industry Association) website. Cc: Andrew Lunn Cc: Florian Fainelli Signed-off-by: Baruch Siach Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/sfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sfp.h b/include/linux/sfp.h index d37518e89db2..d9d9de3fcf8e 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -224,7 +224,7 @@ struct sfp_eeprom_ext { * * See the SFF-8472 specification and related documents for the definition * of these structure members. 
This can be obtained from - * ftp://ftp.seagate.com/sff + * https://www.snia.org/technology-communities/sff/specifications */ struct sfp_eeprom_id { struct sfp_eeprom_base base; From d449ba3d581ed29f751a59792fdc775572c66904 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 27 Nov 2018 09:50:27 +0100 Subject: [PATCH 22/93] net/x25: fix called/calling length calculation in x25_parse_address_block The length of the called and calling address was not calculated correctly (BCD encoding). Signed-off-by: Martin Schiller Signed-off-by: David S. Miller --- net/x25/af_x25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d49aa79b7997..5226a7f43050 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -100,7 +100,7 @@ int x25_parse_address_block(struct sk_buff *skb, } len = *skb->data; - needed = 1 + (len >> 4) + (len & 0x0f); + needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims From 06137619f061f498c2924f6543fa45b7d39f0501 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 27 Nov 2018 09:50:28 +0100 Subject: [PATCH 23/93] net/x25: fix null_x25_address handling o x25_find_listener(): the compare for the null_x25_address was wrong. We have to check the x25_addr of the listener socket instead of the x25_addr of the incomming call. o x25_bind(): it was not possible to bind a socket to null_x25_address Signed-off-by: Martin Schiller Signed-off-by: David S. Miller --- net/x25/af_x25.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5226a7f43050..5121729b8b63 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -288,7 +288,7 @@ static struct sock *x25_find_listener(struct x25_address *addr, sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || - !strcmp(addr->x25_addr, + !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* @@ -688,11 +688,15 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - len = strlen(addr->sx25_addr.x25_addr); - for (i = 0; i < len; i++) { - if (!isdigit(addr->sx25_addr.x25_addr[i])) { - rc = -EINVAL; - goto out; + /* check for the null_x25_address */ + if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { + + len = strlen(addr->sx25_addr.x25_addr); + for (i = 0; i < len; i++) { + if (!isdigit(addr->sx25_addr.x25_addr[i])) { + rc = -EINVAL; + goto out; + } } } From b020fcf6bb4b2d980298c416b3f407075aa2b3b6 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 27 Nov 2018 09:50:29 +0100 Subject: [PATCH 24/93] net/x25: handle call collisions If a session in X25_STATE_1 (Awaiting Call Accept) receives a call request, the session will be closed (x25_disconnect), cause=0x01 (Number Busy) and diag=0x48 (Call Collision) will be set and a clear request will be send. Signed-off-by: Martin Schiller Signed-off-by: David S. 
Miller --- net/x25/x25_in.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 3c12cae32001..afb26221d8a8 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -142,6 +142,15 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->sk_state_change(sk); break; } + case X25_CALL_REQUEST: + /* call collision */ + x25->causediag.cause = 0x01; + x25->causediag.diagnostic = 0x48; + + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_disconnect(sk, EISCONN, 0x01, 0x48); + break; + case X25_CLEAR_REQUEST: if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2)) goto out_clear; From 9410d386d0a829ace9558336263086c2fbbe8aed Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 29 Nov 2018 16:01:04 -0800 Subject: [PATCH 25/93] net: Prevent invalid access to skb->prev in __qdisc_drop_all __qdisc_drop_all() accesses skb->prev to get to the tail of the segment-list. With commit 68d2f84a1368 ("net: gro: properly remove skb from list") the skb-list handling has been changed to set skb->next to NULL and set the list-poison on skb->prev. With that change, __qdisc_drop_all() will panic when it tries to dereference skb->prev. Since commit 992cba7e276d ("net: Add and use skb_list_del_init().") __list_del_entry is used, leaving skb->prev unchanged (thus, pointing to the list-head if it's the first skb of the list). This will make __qdisc_drop_all modify the next-pointer of the list-head and result in a panic later on: [ 34.501053] general protection fault: 0000 [#1] SMP KASAN PTI [ 34.501968] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.20.0-rc2.mptcp #108 [ 34.502887] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 [ 34.504074] RIP: 0010:dev_gro_receive+0x343/0x1f90 [ 34.504751] Code: e0 48 c1 e8 03 42 80 3c 30 00 0f 85 4a 1c 00 00 4d 8b 24 24 4c 39 65 d0 0f 84 0a 04 00 00 49 8d 7c 24 38 48 89 f8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 74 08 3c 04 [ 34.507060] RSP: 0018:ffff8883af507930 EFLAGS: 00010202 [ 34.507761] RAX: 0000000000000007 RBX: ffff8883970b2c80 RCX: 1ffff11072e165a6 [ 34.508640] RDX: 1ffff11075867008 RSI: ffff8883ac338040 RDI: 0000000000000038 [ 34.509493] RBP: ffff8883af5079d0 R08: ffff8883970b2d40 R09: 0000000000000062 [ 34.510346] R10: 0000000000000034 R11: 0000000000000000 R12: 0000000000000000 [ 34.511215] R13: 0000000000000000 R14: dffffc0000000000 R15: ffff8883ac338008 [ 34.512082] FS: 0000000000000000(0000) GS:ffff8883af500000(0000) knlGS:0000000000000000 [ 34.513036] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 34.513741] CR2: 000055ccc3e9d020 CR3: 00000003abf32000 CR4: 00000000000006e0 [ 34.514593] Call Trace: [ 34.514893] [ 34.515157] napi_gro_receive+0x93/0x150 [ 34.515632] receive_buf+0x893/0x3700 [ 34.516094] ? __netif_receive_skb+0x1f/0x1a0 [ 34.516629] ? virtnet_probe+0x1b40/0x1b40 [ 34.517153] ? __stable_node_chain+0x4d0/0x850 [ 34.517684] ? kfree+0x9a/0x180 [ 34.518067] ? __kasan_slab_free+0x171/0x190 [ 34.518582] ? detach_buf+0x1df/0x650 [ 34.519061] ? lapic_next_event+0x5a/0x90 [ 34.519539] ? virtqueue_get_buf_ctx+0x280/0x7f0 [ 34.520093] virtnet_poll+0x2df/0xd60 [ 34.520533] ? receive_buf+0x3700/0x3700 [ 34.521027] ? qdisc_watchdog_schedule_ns+0xd5/0x140 [ 34.521631] ? htb_dequeue+0x1817/0x25f0 [ 34.522107] ? sch_direct_xmit+0x142/0xf30 [ 34.522595] ? virtqueue_napi_schedule+0x26/0x30 [ 34.523155] net_rx_action+0x2f6/0xc50 [ 34.523601] ? napi_complete_done+0x2f0/0x2f0 [ 34.524126] ? kasan_check_read+0x11/0x20 [ 34.524608] ? _raw_spin_lock+0x7d/0xd0 [ 34.525070] ? 
_raw_spin_lock_bh+0xd0/0xd0 [ 34.525563] ? kvm_guest_apic_eoi_write+0x6b/0x80 [ 34.526130] ? apic_ack_irq+0x9e/0xe0 [ 34.526567] __do_softirq+0x188/0x4b5 [ 34.527015] irq_exit+0x151/0x180 [ 34.527417] do_IRQ+0xdb/0x150 [ 34.527783] common_interrupt+0xf/0xf [ 34.528223] This patch makes sure that skb->prev is set to NULL when entering netem_enqueue. Cc: Prashant Bhole Cc: Tyler Hicks Cc: Eric Dumazet Fixes: 68d2f84a1368 ("net: gro: properly remove skb from list") Suggested-by: Eric Dumazet Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2c38e3d07924..22cd46a60057 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -431,6 +431,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, int count = 1; int rc = NET_XMIT_SUCCESS; + /* Do not fool qdisc_drop_all() */ + skb->prev = NULL; + /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) ++count; From 4135cce7fd0a0d755665c02728578c7c5afe4726 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 27 Nov 2018 19:11:50 +0800 Subject: [PATCH 26/93] sctp: update frag_point when stream_interleave is set sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu changes, but we missed one place in sctp_association_init(). It would cause frag_point is zero when sending data. As says in Jakub's reproducer, if sp->pathmtu is set by socketopt, the new asoc->pathmtu inherits it in sctp_association_init(). Later when transports are added and their pmtu >= asoc->pathmtu, it will never call sctp_assoc_update_frag_point() to set frag_point. This patch is to fix it by updating frag_point after asoc->pathmtu is set as sp->pathmtu in sctp_association_init(). Note that it moved them after sctp_stream_init(), as stream->si needs to be set first. Frag_point's calculation is also related with datachunk's type, so it needs to update frag_point when stream->si may be changed in sctp_process_init(). v1->v2: - call sctp_assoc_update_frag_point() separately in sctp_process_init and sctp_association_init, per Marcelo's suggestion. Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point") Reported-by: Jakub Audykowicz Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/associola.c | 7 ++++--- net/sctp/sm_make_chunk.c | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 6a28b96e779e..dd77ec3892b6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -118,9 +118,6 @@ static struct sctp_association *sctp_association_init( asoc->flowlabel = sp->flowlabel; asoc->dscp = sp->dscp; - /* Initialize default path MTU. */ - asoc->pathmtu = sp->pathmtu; - /* Set association default SACK delay */ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); asoc->sackfreq = sp->sackfreq; @@ -252,6 +249,10 @@ static struct sctp_association *sctp_association_init( 0, gfp)) goto fail_init; + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + sctp_assoc_update_frag_point(asoc); + /* Assume that peer would support both address types unless we are * told otherwise. 
*/ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 4a4fd1971255..f4ac6c592e13 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2462,6 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, asoc->c.sinit_max_instreams, gfp)) goto clean_up; + /* Update frag_point when stream_interleave may get changed. */ + sctp_assoc_update_frag_point(asoc); + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; From 5f2b8b62786853341a20d4cd4948f9cbca3db002 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 27 Nov 2018 14:21:43 +0100 Subject: [PATCH 27/93] net: stmmac: Move debugfs init/exit to ->probe()/->remove() Setting up and tearing down debugfs is currently unbalanced, as seen by this error during resume from suspend: [ 752.134067] dwc-eth-dwmac 2490000.ethernet eth0: ERROR failed to create debugfs directory [ 752.134347] dwc-eth-dwmac 2490000.ethernet eth0: stmmac_hw_setup: failed debugFS registration The imbalance happens because the driver creates the debugfs hierarchy when the device is opened and tears it down when the device is closed. There's little gain in that, and it could be argued that it is even surprising because it's not usually done for other devices. Fix the imbalance by moving the debugfs creation and teardown to the driver's ->probe() and ->remove() implementations instead. Note that the ring descriptors cannot be read while the interface is down, so make sure to return an empty file when the descriptors_status debugfs file is read. Signed-off-by: Thierry Reding Acked-by: Jose Abreu Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 076a8be18d67..5551fead8f66 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2550,12 +2550,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) netdev_warn(priv->dev, "PTP init failed\n"); } -#ifdef CONFIG_DEBUG_FS - ret = stmmac_init_fs(dev); - if (ret < 0) - netdev_warn(priv->dev, "%s: failed debugFS registration\n", - __func__); -#endif priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS; if (priv->use_riwt) { @@ -2756,10 +2750,6 @@ static int stmmac_release(struct net_device *dev) netif_carrier_off(dev); -#ifdef CONFIG_DEBUG_FS - stmmac_exit_fs(dev); -#endif - stmmac_release_ptp(priv); return 0; @@ -3899,6 +3889,9 @@ static int stmmac_sysfs_ring_read(struct seq_file *seq, void *v) u32 tx_count = priv->plat->tx_queues_to_use; u32 queue; + if ((dev->flags & IFF_UP) == 0) + return 0; + for (queue = 0; queue < rx_count; queue++) { struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; @@ -4397,6 +4390,13 @@ int stmmac_dvr_probe(struct device *device, goto error_netdev_register; } +#ifdef CONFIG_DEBUG_FS + ret = stmmac_init_fs(ndev); + if (ret < 0) + netdev_warn(priv->dev, "%s: failed debugFS registration\n", + __func__); +#endif + return ret; error_netdev_register: @@ -4432,6 +4432,9 @@ int stmmac_dvr_remove(struct device *dev) netdev_info(priv->dev, "%s: removing driver", __func__); +#ifdef CONFIG_DEBUG_FS + stmmac_exit_fs(ndev); +#endif stmmac_stop_all_dma(priv); stmmac_mac_set(priv, priv->ioaddr, false); From 56e0e295091dde5d0346fad08d3d8b6c07084c9d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 27 Nov 2018 14:00:15 +0000 Subject: [PATCH 28/93]
liquidio: fix spelling mistake "deferal" -> "deferral" There is a spelling mistake in the oct_stats_strings array, fix it. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c index 4c3925af53bc..abe5d0dac851 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c @@ -111,7 +111,7 @@ static const char oct_stats_strings[][ETH_GSTRING_LEN] = { "mac_tx_one_collision", "mac_tx_multi_collision", "mac_tx_max_collision_fail", - "mac_tx_max_deferal_fail", + "mac_tx_max_deferral_fail", "mac_tx_fifo_err", "mac_tx_runts", From 43d0e96022ae3c66743c01bba6c18a3afec7b578 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 27 Nov 2018 14:37:17 +0000 Subject: [PATCH 29/93] openvswitch: fix spelling mistake "execeeds" -> "exceeds" There is a spelling mistake in a net_warn_ratelimited message, fix this. Signed-off-by: Colin Ian King Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a4660c48ff01..cd94f925495a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1166,7 +1166,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); if (err) { net_warn_ratelimited("openvswitch: zone: %u " - "execeeds conntrack limit\n", + "exceeds conntrack limit\n", info->zone.id); return err; } From 37c4b91f955fdd5f4ad771956b97d35f1321098e Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 27 Nov 2018 14:51:17 +0000 Subject: [PATCH 30/93] net: aquantia: fix rx checksum offload bits The last set of csum offload fixes had a leak: Checksum enabled status bits from rx descriptor were incorrectly interpreted. Consequently all the other valid logic worked on zero bits. That caused rx checksum offloads never to trigger. Tested by dumping rx descriptors and validating resulting csum_level. Reported-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: Igor Russkikh Fixes: ad703c2b9127f ("net: aquantia: invalid checksumm offload implementation") Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index f02592f43fe3..a7e853fa43c2 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -674,7 +674,7 @@ static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self, rx_stat = (0x0000003CU & rxd_wb->status) >> 2; - is_rx_check_sum_enabled = (rxd_wb->type) & (0x3U << 19); + is_rx_check_sum_enabled = (rxd_wb->type >> 19) & 0x3U; pkt_type = 0xFFU & (rxd_wb->type >> 4); From 3b5b3a3331d141e8f2a7aaae3a94dfa1e61ecbe4 Mon Sep 17 00:00:00 2001 From: Toni Peltonen Date: Tue, 27 Nov 2018 16:56:57 +0200 Subject: [PATCH 31/93] bonding: fix 802.3ad state sent to partner when unbinding slave Previously when unbinding a slave the 802.3ad implementation only told partner that the port is not suitable for aggregation by setting the port aggregation state from aggregatable to individual. 
This is not enough. If the physical layer still stays up and we only unbound this port from the bond, there is nothing in the aggregation status alone to prevent the partner from sending traffic towards us. To ensure that the partner doesn't consider this port at all anymore, we should also disable collecting and distributing to signal that this actor is going away. Also clear AD_STATE_SYNCHRONIZATION to ensure the partner exits the collecting + distributing state. I have tested this behaviour against Arista EOS switches with mlx5 cards (physical link stays up even when the interface is down) and simulated the same situation virtually (Linux <-> Linux) with two network namespaces running two veth device pairs. In both cases setting aggregation to individual doesn't alone prevent traffic from being sent towards this port, given that the link stays up on the partner's end. The partner still keeps its end in the collecting + distributing state and continues until the timeout is reached. In most cases this means we are losing the traffic the partner sends towards our port while we wait for the timeout. This is most visible with a slow periodic time (LACP rate slow). Other open source implementations like Open vSwitch and libreswitch, and vendor implementations like Arista EOS, seem to disable collecting + distributing when doing a similar port disabling/detaching/removing change. With this patch the kernel implementation behaves the same way and ensures the partner doesn't consider our actor viable anymore. Signed-off-by: Toni Peltonen Signed-off-by: Jay Vosburgh Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index f43fb2f958a5..93dfcef8afc4 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2086,6 +2086,9 @@ void bond_3ad_unbind_slave(struct slave *slave) aggregator->aggregator_identifier); /* Tell the partner that this port is not suitable for aggregation */ + port->actor_oper_port_state &= ~AD_STATE_SYNCHRONIZATION; + port->actor_oper_port_state &= ~AD_STATE_COLLECTING; + port->actor_oper_port_state &= ~AD_STATE_DISTRIBUTING; port->actor_oper_port_state &= ~AD_STATE_AGGREGATION; __update_lacpdu_from_port(port); ad_lacpdu_send(port); From 1166494891da88af25c444e65cd4f32c3e026b46 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Tue, 27 Nov 2018 14:04:11 -0800 Subject: [PATCH 32/93] nfp: flower: release metadata on offload failure Calling nfp_compile_flow_metadata both assigns a stats context and increments a ref counter on (or allocates) a mask id table entry. These are released by the nfp_modify_flow_metadata call on flow deletion; however, if a flow add fails after metadata is set, then the flow entry will be deleted but the metadata assignments leaked. Add an error path to the flow add offload function to ensure allocated metadata is released in the event of an offload fail. Fixes: 81f3ddf2547d ("nfp: add control message passing capabilities to flower offloads") Signed-off-by: John Hurley Reviewed-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S.
Miller --- drivers/net/ethernet/netronome/nfp/flower/offload.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 29c95423ab64..c3ad8d737cf0 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -479,13 +479,13 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, err = nfp_flower_xmit_flow(netdev, flow_pay, NFP_FLOWER_CMSG_TYPE_FLOW_ADD); if (err) - goto err_destroy_flow; + goto err_release_metadata; flow_pay->tc_flower_cookie = flow->cookie; err = rhashtable_insert_fast(&priv->flow_table, &flow_pay->fl_node, nfp_flower_table_params); if (err) - goto err_destroy_flow; + goto err_release_metadata; port->tc_offload_cnt++; @@ -494,6 +494,8 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, return 0; +err_release_metadata: + nfp_modify_flow_metadata(app, flow_pay); err_destroy_flow: kfree(flow_pay->action_data); kfree(flow_pay->mask_data); From b5f0cf08340090d1503dbdbfd797e32264974100 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Tue, 27 Nov 2018 14:04:12 -0800 Subject: [PATCH 33/93] nfp: flower: prevent offload if rhashtable insert fails For flow offload adds, if the rhash insert code fails, the flow will still have been offloaded but the reference to it in the driver freed. Re-order the offload setup calls to ensure that a flow will only be written to FW if a kernel reference is held and stored in the rhashtable. Remove this hashtable entry if the offload fails. Fixes: c01d0efa5136 ("nfp: flower: use rhashtable for flow caching") Signed-off-by: John Hurley Reviewed-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/flower/offload.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index c3ad8d737cf0..2f49eb75f3cc 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -476,17 +476,17 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, if (err) goto err_destroy_flow; - err = nfp_flower_xmit_flow(netdev, flow_pay, - NFP_FLOWER_CMSG_TYPE_FLOW_ADD); - if (err) - goto err_release_metadata; - flow_pay->tc_flower_cookie = flow->cookie; err = rhashtable_insert_fast(&priv->flow_table, &flow_pay->fl_node, nfp_flower_table_params); if (err) goto err_release_metadata; + err = nfp_flower_xmit_flow(netdev, flow_pay, + NFP_FLOWER_CMSG_TYPE_FLOW_ADD); + if (err) + goto err_remove_rhash; + port->tc_offload_cnt++; /* Deallocate flow payload when flower rule has been destroyed. */ @@ -494,6 +494,10 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, return 0; +err_remove_rhash: + WARN_ON_ONCE(rhashtable_remove_fast(&priv->flow_table, + &flow_pay->fl_node, + nfp_flower_table_params)); err_release_metadata: nfp_modify_flow_metadata(app, flow_pay); err_destroy_flow: From c01ac66b38660f2b507ccd0b75d28e3002d56fbb Mon Sep 17 00:00:00 2001 From: David Miller Date: Wed, 28 Nov 2018 22:33:53 -0800 Subject: [PATCH 34/93] bpf: Fix verifier log string check for bad alignment. The message got changed a long time ago. This was responsible for 36 test case failures on sparc64. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: David S.
Miller Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 550b7e46bf4a..5dd4410a716c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -14230,7 +14230,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, reject_from_alignment = fd_prog < 0 && (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) && - strstr(bpf_vlog, "Unknown alignment."); + strstr(bpf_vlog, "misaligned"); #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS if (reject_from_alignment) { printf("FAIL\nFailed due to alignment despite having efficient unaligned access: '%s'!\n", From b7df9ada9a7700dbcca1ba53d217c01e3d48179c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 1 Dec 2018 01:18:53 +0100 Subject: [PATCH 35/93] bpf: fix pointer offsets in context for 32 bit Currently, pointer offsets in three BPF context structures are broken in two scenarios: i) 32 bit compiled applications running on 64 bit kernels, and ii) LLVM compiled BPF programs running on 32 bit kernels. The latter is due to BPF target machine being strictly 64 bit. So in each of the cases the offsets will mismatch in verifier when checking / rewriting context access. Fix this by providing a helper macro __bpf_md_ptr() that will enforce padding up to 64 bit and proper alignment, and for context access a macro bpf_ctx_range_ptr() which will cover full 64 bit member range on 32 bit archs. For flow_keys, we additionally need to force the size check to sizeof(__u64) as with other pointer types. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Reported-by: David S. Miller Signed-off-by: Daniel Borkmann Acked-by: David S. Miller Tested-by: David S. Miller Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +++++++ include/uapi/linux/bpf.h | 17 ++++++++++++----- net/core/filter.c | 16 ++++++++-------- tools/include/uapi/linux/bpf.h | 17 ++++++++++++----- 4 files changed, 39 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 448dcc448f1f..795ff0b869bb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -449,6 +449,13 @@ struct sock_reuseport; offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 +#if BITS_PER_LONG == 64 +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +#else +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 +#endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 852dc17ab47a..426b5c8a245b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2422,6 +2422,12 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6_INLINE }; +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2456,7 +2462,7 @@ struct __sk_buff { /* ... here. 
*/ __u32 data_meta; - struct bpf_flow_keys *flow_keys; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); }; struct bpf_tunnel_key { @@ -2572,8 +2578,8 @@ enum sk_action { * be added to the end of this structure */ struct sk_msg_md { - void *data; - void *data_end; + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -2589,8 +2595,9 @@ struct sk_reuseport_md { * Start of directly accessible data. It begins from * the tcp/udp header. */ - void *data; - void *data_end; /* End of directly accessible data */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). * Note that the directly accessible bytes (data_end - data) diff --git a/net/core/filter.c b/net/core/filter.c index 9a1327eb25fa..6ee605da990f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5435,8 +5435,8 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): - if (size != sizeof(struct bpf_flow_keys *)) + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) return false; break; default: @@ -5464,7 +5464,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5489,7 +5489,7 @@ static bool cg_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5530,7 +5530,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; } @@ -5756,7 +5756,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5958,7 +5958,7 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; } @@ -6039,7 +6039,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; case bpf_ctx_range(struct __sk_buff, tc_classid): diff --git 
a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 852dc17ab47a..426b5c8a245b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2422,6 +2422,12 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6_INLINE }; +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2456,7 +2462,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; - struct bpf_flow_keys *flow_keys; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); }; struct bpf_tunnel_key { @@ -2572,8 +2578,8 @@ enum sk_action { * be added to the end of this structure */ struct sk_msg_md { - void *data; - void *data_end; + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -2589,8 +2595,9 @@ struct sk_reuseport_md { * Start of directly accessible data. It begins from * the tcp/udp header. */ - void *data; - void *data_end; /* End of directly accessible data */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). * Note that the directly accessible bytes (data_end - data) From fd6d433865a2ad1f7e018ef80408cb3dc3be1ab3 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 28 Nov 2018 18:43:42 +0100 Subject: [PATCH 36/93] net/sched: act_police: fix memory leak in case of invalid control action when users set an invalid control action, kmemleak complains as follows: # echo clear >/sys/kernel/debug/kmemleak # ./tdc.py -e b48b Test b48b: Add police action with exceed goto chain control action All test results: 1..1 ok 1 - b48b # Add police action with exceed goto chain control action about to flush the tap output if tests need to be skipped done flushing skipped test tap output # echo scan >/sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffffa0fafbc3dde0 (size 96): comm "tc", pid 2358, jiffies 4294922738 (age 17.022s) hex dump (first 32 bytes): 2a 00 00 20 00 00 00 00 00 00 7d 00 00 00 00 00 *.. ......}..... f8 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000648803d2>] tcf_action_init_1+0x384/0x4c0 [<00000000cb69382e>] tcf_action_init+0x12b/0x1a0 [<00000000847ef0d4>] tcf_action_add+0x73/0x170 [<0000000093656e14>] tc_ctl_action+0x122/0x160 [<0000000023c98e32>] rtnetlink_rcv_msg+0x263/0x2d0 [<000000003493ae9c>] netlink_rcv_skb+0x4d/0x130 [<00000000de63f8ba>] netlink_unicast+0x209/0x2d0 [<00000000c3da0ebe>] netlink_sendmsg+0x2c1/0x3c0 [<000000007a9e0753>] sock_sendmsg+0x33/0x40 [<00000000457c6d2e>] ___sys_sendmsg+0x2a0/0x2f0 [<00000000c5c6a086>] __sys_sendmsg+0x5e/0xa0 [<00000000446eafce>] do_syscall_64+0x5b/0x180 [<000000004aa871f2>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<00000000450c38ef>] 0xffffffffffffffff change tcf_police_init() to avoid leaking 'new' in case TCA_POLICE_RESULT contains TC_ACT_GOTO_CHAIN extended action. Fixes: c08f5ed5d625 ("net/sched: act_police: disallow 'goto chain' on fallback control action") Reported-by: Dan Carpenter Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/act_police.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 37c9b8f0e10f..ec8ec55e0fe8 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -85,7 +85,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { - int ret = 0, err; + int ret = 0, tcfp_result = TC_ACT_OK, err, size; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; struct tcf_police *police; @@ -93,7 +93,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, police_net_id); struct tcf_police_params *new; bool exists = false; - int size; if (nla == NULL) return -EINVAL; @@ -160,6 +159,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, goto failure; } + if (tb[TCA_POLICE_RESULT]) { + tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) { + NL_SET_ERR_MSG(extack, + "goto chain not allowed on fallback"); + err = -EINVAL; + goto failure; + } + } + new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; @@ -167,6 +176,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } /* No failure allowed after this point */ + new->tcfp_result = tcfp_result; new->tcfp_mtu = parm->mtu; if (!new->tcfp_mtu) { new->tcfp_mtu = ~0; @@ -196,16 +206,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); - if (tb[TCA_POLICE_RESULT]) { - new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - if (TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) { - NL_SET_ERR_MSG(extack, - "goto chain not allowed on fallback"); - err = -EINVAL; - goto failure; - } - } - spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); From f71c6143c2038df1cb43a4b9c90740d14f77467c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 30 Nov 2018 15:32:20 -0800 Subject: [PATCH 37/93] bpf: Support sk lookup in netns with id 0 David Ahern and Nicolas Dichtel report that the handling of the netns id 0 is incorrect for the BPF socket lookup helpers: rather than finding the netns with id 0, it is resolving to the current netns. This renders the netns_id 0 inaccessible. To fix this, adjust the API for the netns to treat all negative s32 values as a lookup in the current netns (including u64 values which when truncated to s32 become negative), while any values with a positive value in the signed 32-bit integer space would result in a lookup for a socket in the netns corresponding to that id. As before, if the netns with that ID does not exist, no socket will be found. Any netns outside of these ranges will fail to find a corresponding socket, as those values are reserved for future usage. 
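For illustration only (not part of this patch), a minimal TC classifier sketch using the revised netns semantics, assuming the selftest-style helper declarations from bpf_helpers.h:

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include "bpf_helpers.h"

	SEC("classifier/netns_lookup_sketch")
	int netns_lookup_sketch(struct __sk_buff *skb)
	{
		struct bpf_sock_tuple tuple = {};
		struct bpf_sock *sk;

		/* A negative netns value such as BPF_F_CURRENT_NETNS (-1) selects
		 * the netns associated with the skb; values in [0, S32_MAX] are
		 * relative netns IDs; anything else finds no socket.
		 */
		sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (!sk)
			return TC_ACT_UNSPEC;

		bpf_sk_release(sk); /* a successful lookup must always be released */
		return TC_ACT_OK;
	}

	char _license[] SEC("license") = "GPL";
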
Signed-off-by: Joe Stringer Acked-by: Nicolas Dichtel Acked-by: Joey Pabalinas Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 35 ++++++++++------- net/core/filter.c | 11 +++--- tools/include/uapi/linux/bpf.h | 39 ++++++++++++------- tools/testing/selftests/bpf/bpf_helpers.h | 4 +- .../selftests/bpf/test_sk_lookup_kern.c | 18 ++++----- 5 files changed, 63 insertions(+), 44 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 426b5c8a245b..cba518c57229 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2170,7 +2170,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2187,12 +2187,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2202,7 +2204,7 @@ union bpf_attr { * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. * - * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2219,12 +2221,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. 
*netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2405,6 +2409,9 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output for sk_buff input context. */ #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) +/* Current network namespace */ +#define BPF_F_CURRENT_NETNS (-1L) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index 6ee605da990f..8d2c629501e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4890,22 +4890,23 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *net; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; - if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + if (unlikely(family == AF_UNSPEC || flags || + !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (skb->dev) caller_net = dev_net(skb->dev); else caller_net = sock_net(skb->sk); - if (netns_id) { + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, skb, family, proto); put_net(net); - } else { - net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); } if (sk) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 426b5c8a245b..76b265c7d93e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2170,7 +2170,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2187,12 +2187,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2201,8 +2203,10 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, *struct bpf_sock* + * return is from reuse->socks[] using hash of the packet. 
* - * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2219,12 +2223,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2233,6 +2239,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, *struct bpf_sock* + * return is from reuse->socks[] using hash of the packet. * * int bpf_sk_release(struct bpf_sock *sk) * Description @@ -2405,6 +2413,9 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output for sk_buff input context. */ #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) +/* Current network namespace */ +#define BPF_F_CURRENT_NETNS (-1L) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 686e57ce40f4..efb6c13ab0de 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -154,12 +154,12 @@ static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = (void *) BPF_FUNC_skb_ancestor_cgroup_id; static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, - int size, unsigned int netns_id, + int size, unsigned long long netns_id, unsigned long long flags) = (void *) BPF_FUNC_sk_lookup_tcp; static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, - int size, unsigned int netns_id, + int size, unsigned long long netns_id, unsigned long long flags) = (void *) BPF_FUNC_sk_lookup_udp; static int (*bpf_sk_release)(struct bpf_sock *sk) = diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/test_sk_lookup_kern.c index b745bdc08c2b..e21cd736c196 100644 --- a/tools/testing/selftests/bpf/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/test_sk_lookup_kern.c @@ -72,7 +72,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb) return TC_ACT_SHOT; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); - sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, 0, 0); + sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); if (sk) bpf_sk_release(sk); return sk ? 
TC_ACT_OK : TC_ACT_UNSPEC; @@ -84,7 +84,7 @@ int bpf_sk_lookup_test1(struct __sk_buff *skb) struct bpf_sock_tuple tuple = {}; struct bpf_sock *sk; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); if (sk) bpf_sk_release(sk); return 0; @@ -97,7 +97,7 @@ int bpf_sk_lookup_uaf(struct __sk_buff *skb) struct bpf_sock *sk; __u32 family = 0; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); if (sk) { bpf_sk_release(sk); family = sk->family; @@ -112,7 +112,7 @@ int bpf_sk_lookup_modptr(struct __sk_buff *skb) struct bpf_sock *sk; __u32 family; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); if (sk) { sk += 1; bpf_sk_release(sk); @@ -127,7 +127,7 @@ int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb) struct bpf_sock *sk; __u32 family; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); sk += 1; if (sk) bpf_sk_release(sk); @@ -139,7 +139,7 @@ int bpf_sk_lookup_test2(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; - bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); return 0; } @@ -149,7 +149,7 @@ int bpf_sk_lookup_test3(struct __sk_buff *skb) struct bpf_sock_tuple tuple = {}; struct bpf_sock *sk; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); bpf_sk_release(sk); bpf_sk_release(sk); return 0; @@ -161,7 +161,7 @@ int bpf_sk_lookup_test4(struct __sk_buff *skb) struct bpf_sock_tuple tuple = {}; struct bpf_sock *sk; - sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); bpf_sk_release(sk); return 0; } @@ -169,7 +169,7 @@ int bpf_sk_lookup_test4(struct __sk_buff *skb) void lookup_no_release(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; - bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0); + bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); } SEC("fail_no_release_subcall") From d74286d2c25ad29dbf9e342955dd8dc31f21653b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 30 Nov 2018 15:32:21 -0800 Subject: [PATCH 38/93] bpf: Improve socket lookup reuseport documentation Improve the wording around socket lookup for reuseport sockets, and ensure that both bpf.h headers are in sync. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 ++++ tools/include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cba518c57229..72c453a8bf50 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2203,6 +2203,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description @@ -2237,6 +2239,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. 
* Return * Pointer to *struct bpf_sock*, or NULL in case of failure. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * int bpf_sk_release(struct bpf_sock *sk) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 76b265c7d93e..72c453a8bf50 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2203,8 +2203,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, *struct bpf_sock* - * return is from reuse->socks[] using hash of the packet. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description @@ -2239,8 +2239,8 @@ union bpf_attr { * **CONFIG_NET** configuration option. * Return * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, *struct bpf_sock* - * return is from reuse->socks[] using hash of the packet. + * For sockets with reuseport option, the *struct bpf_sock* + * result is from reuse->socks[] using the hash of the tuple. * * int bpf_sk_release(struct bpf_sock *sk) * Description From a3d7e01da06013dc580641a1da57c3b482d58157 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 28 Nov 2018 13:40:04 -0800 Subject: [PATCH 39/93] net: dsa: Fix tagging attribute location While introducing the DSA tagging protocol attribute, it was added to the DSA slave network devices, but those actually see untagged traffic (that is their whole purpose). Correct this mistake by putting the tagging sysfs attribute under the DSA master network device where this is the information that we need. While at it, also correct the sysfs documentation mistake that missed the "dsa/" directory component of the attribute. Fixes: 98cdb4807123 ("net: dsa: Expose tagging protocol to user-space") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- Documentation/ABI/testing/sysfs-class-net-dsa | 2 +- net/dsa/master.c | 34 ++++++++++++++++++- net/dsa/slave.c | 28 --------------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-net-dsa b/Documentation/ABI/testing/sysfs-class-net-dsa index f240221e071e..985d84c585c6 100644 --- a/Documentation/ABI/testing/sysfs-class-net-dsa +++ b/Documentation/ABI/testing/sysfs-class-net-dsa @@ -1,4 +1,4 @@ -What: /sys/class/net//tagging +What: /sys/class/net//dsa/tagging Date: August 2018 KernelVersion: 4.20 Contact: netdev@vger.kernel.org diff --git a/net/dsa/master.c b/net/dsa/master.c index c90ee3227dea..5e8c9bef78bd 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -158,8 +158,31 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) cpu_dp->orig_ethtool_ops = NULL; } +static ssize_t tagging_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct dsa_port *cpu_dp = dev->dsa_ptr; + + return sprintf(buf, "%s\n", + dsa_tag_protocol_to_str(cpu_dp->tag_ops)); +} +static DEVICE_ATTR_RO(tagging); + +static struct attribute *dsa_slave_attrs[] = { + &dev_attr_tagging.attr, + NULL +}; + +static const struct attribute_group dsa_group = { + .name = "dsa", + .attrs = dsa_slave_attrs, +}; + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { + int ret; + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -168,11 +191,20 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) dev->dsa_ptr = cpu_dp; - return dsa_master_ethtool_setup(dev); + ret = dsa_master_ethtool_setup(dev); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->dev.kobj, &dsa_group); + if (ret) + dsa_master_ethtool_teardown(dev); + + return ret; } void dsa_master_teardown(struct net_device *dev) { + sysfs_remove_group(&dev->dev.kobj, &dsa_group); dsa_master_ethtool_teardown(dev); dev->dsa_ptr = NULL; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7d0c19e7edcf..aec78f5aca72 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1058,27 +1058,6 @@ static struct device_type dsa_type = { .name = "dsa", }; -static ssize_t tagging_show(struct device *d, struct device_attribute *attr, - char *buf) -{ - struct net_device *dev = to_net_dev(d); - struct dsa_port *dp = dsa_slave_to_port(dev); - - return sprintf(buf, "%s\n", - dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops)); -} -static DEVICE_ATTR_RO(tagging); - -static struct attribute *dsa_slave_attrs[] = { - &dev_attr_tagging.attr, - NULL -}; - -static const struct attribute_group dsa_group = { - .name = "dsa", - .attrs = dsa_slave_attrs, -}; - static void dsa_slave_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) @@ -1374,14 +1353,8 @@ int dsa_slave_create(struct dsa_port *port) goto out_phy; } - ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group); - if (ret) - goto out_unreg; - return 0; -out_unreg: - unregister_netdev(slave_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); @@ -1405,7 +1378,6 @@ void dsa_slave_destroy(struct net_device *slave_dev) rtnl_unlock(); dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); - sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group); unregister_netdev(slave_dev); phylink_destroy(dp->pl); free_percpu(p->stats64); From ef6fcd455278c2be3032a346cc66d9dd9866b787 Mon Sep 17 00:00:00 2001 From: Cong 
Wang Date: Wed, 28 Nov 2018 15:04:05 -0800 Subject: [PATCH 40/93] mlx5: fix get_ip_proto() IP header is not necessarily located right after struct ethhdr, there could be multiple 802.1Q headers in between, this is why we call __vlan_get_protocol(). Fixes: fe1dc069990c ("net/mlx5e: don't set CHECKSUM_COMPLETE on SCTP packets") Cc: Alaa Hleihel Cc: Or Gerlitz Cc: Saeed Mahameed Signed-off-by: Cong Wang Reviewed-by: Tariq Toukan Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 16985ca3248d..624eed345b5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -724,9 +724,9 @@ static u32 mlx5e_get_fcs(const struct sk_buff *skb) return __get_unaligned_cpu32(fcs_bytes); } -static u8 get_ip_proto(struct sk_buff *skb, __be16 proto) +static u8 get_ip_proto(struct sk_buff *skb, int network_depth, __be16 proto) { - void *ip_p = skb->data + sizeof(struct ethhdr); + void *ip_p = skb->data + network_depth; return (proto == htons(ETH_P_IP)) ? ((struct iphdr *)ip_p)->protocol : ((struct ipv6hdr *)ip_p)->nexthdr; @@ -755,7 +755,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, goto csum_unnecessary; if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) { - if (unlikely(get_ip_proto(skb, proto) == IPPROTO_SCTP)) + if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP)) goto csum_unnecessary; skb->ip_summed = CHECKSUM_COMPLETE; From c0f53771ba45745e5870daf880127925c93f232f Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Thu, 29 Nov 2018 07:54:22 +0800 Subject: [PATCH 41/93] liquidio: read sc->iq_no before release sc The function lio_vf_rep_packet_sent_callback releases the occupation of sc via octeon_free_soft_command. sc should not be used after that. Unfortunately, sc->iq_no is read. To fix this, the patch stores sc->iq_no into a local variable before releasing sc and then uses the local variable instead of sc->iq_no. Signed-off-by: Pan Bian Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c index ea9859e028d4..de61060721c4 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c @@ -349,13 +349,15 @@ lio_vf_rep_packet_sent_callback(struct octeon_device *oct, struct octeon_soft_command *sc = (struct octeon_soft_command *)buf; struct sk_buff *skb = sc->ctxptr; struct net_device *ndev = skb->dev; + u32 iq_no; dma_unmap_single(&oct->pci_dev->dev, sc->dmadptr, sc->datasize, DMA_TO_DEVICE); dev_kfree_skb_any(skb); + iq_no = sc->iq_no; octeon_free_soft_command(oct, sc); - if (octnet_iq_is_full(oct, sc->iq_no)) + if (octnet_iq_is_full(oct, iq_no)) return; if (netif_queue_stopped(ndev)) From 3976535af0cb9fe34a55f2ffb8d7e6b39a2f8188 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:43 -0800 Subject: [PATCH 42/93] tcp: fix off-by-one bug on aborting window-probing socket Previously there is an off-by-one bug on determining when to abort a stalled window-probing socket. This patch fixes that so it is consistent with tcp_write_timeout(). 
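For illustration, a condensed sketch of the corrected check in tcp_probe_timer() (assuming max_probes has already been derived from tcp_retries2 or the user timeout, as the function does earlier): with max_probes == 2, the second unanswered probe now aborts the connection, whereas the old '>' comparison let a third probe go out first.

	/* Sketch: abort once max_probes probes have gone unanswered, matching
	 * the retries accounting in tcp_write_timeout().
	 */
	if (icsk->icsk_probes_out >= max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
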
Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 091c53925e4d..25efdae4368a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -378,7 +378,7 @@ static void tcp_probe_timer(struct sock *sk) return; } - if (icsk->icsk_probes_out > max_probes) { + if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ From ec641b39457e17774313b66697a8a1dc070257bd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:44 -0800 Subject: [PATCH 43/93] tcp: fix SNMP under-estimation on failed retransmission Previously the SNMP counter LINUX_MIB_TCPRETRANSFAIL is not counting the TSO/GSO properly on failed retransmission. This patch fixes that. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3f510cad0b3e..68b5326f7321 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2920,7 +2920,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } return err; } From e1561fe2dd69dc5dddd69bd73aa65355bdfb048b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:45 -0800 Subject: [PATCH 44/93] tcp: fix SNMP TCP timeout under-estimation Previously the SNMP TCPTIMEOUTS counter has inconsistent accounting: 1. It counts all SYN and SYN-ACK timeouts 2. It counts timeouts in other states except recurring timeouts and timeouts after fast recovery or disorder state. Such selective accounting makes analysis difficult and complicated. For example the monitoring system needs to collect many other SNMP counters to infer the total amount of timeout events. This patch makes TCPTIMEOUTS counter simply counts all the retransmit timeout (SYN or data or FIN). Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_timer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 25efdae4368a..f87dbc78b6bc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -484,11 +484,12 @@ void tcp_retransmit_timer(struct sock *sk) goto out_reset_timer; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { - int mib_idx; + int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) @@ -503,10 +504,9 @@ void tcp_retransmit_timer(struct sock *sk) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else { - mib_idx = LINUX_MIB_TCPTIMEOUTS; } - __NET_INC_STATS(sock_net(sk), mib_idx); + if (mib_idx) + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); From 436c9453a1ac0944b82870ef2e0d9be956b396d9 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 29 Nov 2018 13:53:16 +0800 Subject: [PATCH 45/93] virtio-net: keep vnet header zeroed after processing XDP We copy the vnet header unconditionally in page_to_skb(); this is wrong since XDP may modify the packet data. So let's keep a zeroed vnet header to avoid confusing the conversion between vnet header and skb metadata. In the future, we should be able to detect whether or not the packet was modified and keep using the vnet header when the packet was not touched. Fixes: f600b6905015 ("virtio_net: Add XDP support") Reported-by: Pavel Popa Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index cecfd77c9f3c..ea672145f6a6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -365,7 +365,8 @@ static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, struct page *page, unsigned int offset, - unsigned int len, unsigned int truesize) + unsigned int len, unsigned int truesize, + bool hdr_valid) { struct sk_buff *skb; struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -387,7 +388,8 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, else hdr_padded_len = sizeof(struct padded_vnet_hdr); - memcpy(hdr, p, hdr_len); + if (hdr_valid) + memcpy(hdr, p, hdr_len); len -= hdr_len; offset += hdr_padded_len; @@ -739,7 +741,8 @@ static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page = buf; - struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); + struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, + PAGE_SIZE, true); stats->bytes += len - vi->hdr_len; if (unlikely(!skb)) @@ -842,7 +845,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, rcu_read_unlock(); put_page(page); head_skb = page_to_skb(vi, rq, xdp_page, - offset, len, PAGE_SIZE); + offset, len, + PAGE_SIZE, false); return head_skb; } break; @@ -898,7 +902,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto err_skb; } - head_skb = page_to_skb(vi, rq, page, offset, len, truesize); + head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog); curr_skb = head_skb; if (unlikely(!curr_skb)) From 35b827b6d06199841a83839e8bb69c0cd13a28be Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 29 Nov 2018 14:45:39 +0100 Subject: [PATCH 46/93] tun: forbid iface creation with rtnl ops It's not supported right now (the goal
of the initial patch was to support 'ip link del' only). Before the patch: $ ip link add foo type tun [ 239.632660] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [snip] [ 239.636410] RIP: 0010:register_netdevice+0x8e/0x3a0 This panic occurs because dev->netdev_ops is not set by tun_setup(). But to have something usable, it will require more than just setting netdev_ops. Fixes: f019a7a594d9 ("tun: Implement ip link del tunXXX") CC: Eric W. Biederman Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/tun.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e244f5d7512a..cf349e65a66b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2293,9 +2293,9 @@ static void tun_setup(struct net_device *dev) static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - if (!data) - return 0; - return -EINVAL; + NL_SET_ERR_MSG(extack, + "tun/tap creation via rtnetlink is not supported."); + return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) From dcb40590e69e306030e944a39d0e4bf54247fb68 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 1 Dec 2018 10:39:44 -0800 Subject: [PATCH 47/93] bpf: refactor bpf_test_run() to separate own failures and test program result After commit f42ee093be29 ("bpf/test_run: support cgroup local storage") the bpf_test_run() function may fail with -ENOMEM, if it's not possible to allocate memory for a cgroup local storage. This error shouldn't be mixed with the return value of the testing program. Let's add an additional argument with a pointer where to store the testing program's result; and make bpf_test_run() return either 0 or -ENOMEM. 
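For illustration, a condensed caller-side sketch of the new convention (prog, skb, kattr, uattr and size as set up earlier in bpf_prog_test_run_skb(); error unwinding trimmed for brevity):

	u32 retval, duration;
	int err;

	/* err reports infrastructure failures such as -ENOMEM ... */
	err = bpf_test_run(prog, skb, repeat, &retval, &duration);
	if (err)
		return err;

	/* ... while retval carries the test program's own return code. */
	return bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
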
Fixes: f42ee093be29 ("bpf/test_run: support cgroup local storage") Reported-by: Dan Carpenter Suggested-by: Alexei Starovoitov Signed-off-by: Roman Gushchin Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c89c22c49015..25001913d03b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -28,12 +28,13 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, return ret; } -static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, + u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; - u32 ret = 0, i; + u32 i; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -49,7 +50,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx, storage); + *ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -65,7 +66,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return ret; + return 0; } static int bpf_test_finish(const union bpf_attr *kattr, @@ -165,7 +166,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); - retval = bpf_test_run(prog, skb, repeat, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + if (ret) { + kfree_skb(skb); + kfree(sk); + return ret; + } if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -212,11 +218,14 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - retval = bpf_test_run(prog, &xdp, repeat, &duration); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + if (ret) + goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); +out: kfree(data); return ret; } From d2a36971ef595069b7a600d1144c2e0881a930a1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Dec 2018 08:19:33 +0100 Subject: [PATCH 48/93] net: phy: don't allow __set_phy_supported to add unsupported modes Currently __set_phy_supported allows to add modes w/o checking whether the PHY supports them. This is wrong, it should never add modes but only remove modes we don't want to support. The commit marked as fixed didn't do anything wrong, it just copied existing functionality to the helper which is being fixed now. Fixes: f3a6bd393c2c ("phylib: Add phy_set_max_speed helper") Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 23ee3967c166..18e92c19c5ab 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1880,20 +1880,17 @@ EXPORT_SYMBOL(genphy_loopback); static int __set_phy_supported(struct phy_device *phydev, u32 max_speed) { - phydev->supported &= ~(PHY_1000BT_FEATURES | PHY_100BT_FEATURES | - PHY_10BT_FEATURES); - switch (max_speed) { - default: - return -ENOTSUPP; - case SPEED_1000: - phydev->supported |= PHY_1000BT_FEATURES; + case SPEED_10: + phydev->supported &= ~PHY_100BT_FEATURES; /* fall through */ case SPEED_100: - phydev->supported |= PHY_100BT_FEATURES; - /* fall through */ - case SPEED_10: - phydev->supported |= PHY_10BT_FEATURES; + phydev->supported &= ~PHY_1000BT_FEATURES; + break; + case SPEED_1000: + break; + default: + return -ENOTSUPP; } return 0; From a5d4a89245ead1f37ed135213653c5beebea4237 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Mon, 3 Dec 2018 15:33:07 +0800 Subject: [PATCH 49/93] net: 8139cp: fix a BUG triggered by changing mtu with network traffic When changing mtu many times with traffic, a bug is triggered: [ 1035.684037] kernel BUG at lib/dynamic_queue_limits.c:26! [ 1035.684042] invalid opcode: 0000 [#1] SMP [ 1035.684049] Modules linked in: loop binfmt_misc 8139cp(OE) macsec tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag tcp_lp fuse uinput xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter devlink ip6_tables iptable_filter sunrpc snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep ppdev snd_seq iosf_mbi crc32_pclmul parport_pc snd_seq_device ghash_clmulni_intel parport snd_pcm aesni_intel joydev lrw snd_timer virtio_balloon sg gf128mul glue_helper ablk_helper cryptd snd soundcore i2c_piix4 pcspkr ip_tables xfs libcrc32c sr_mod sd_mod cdrom crc_t10dif crct10dif_generic ata_generic [ 1035.684102] pata_acpi virtio_console qxl drm_kms_helper syscopyarea sysfillrect sysimgblt floppy fb_sys_fops crct10dif_pclmul crct10dif_common ttm crc32c_intel serio_raw ata_piix drm libata 8139too virtio_pci drm_panel_orientation_quirks virtio_ring virtio mii dm_mirror dm_region_hash dm_log dm_mod [last unloaded: 8139cp] [ 1035.684132] CPU: 9 PID: 25140 Comm: if-mtu-change Kdump: loaded Tainted: G OE ------------ T 3.10.0-957.el7.x86_64 #1 [ 1035.684134] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 1035.684136] task: ffff8f59b1f5a080 ti: ffff8f5a2e32c000 task.ti: ffff8f5a2e32c000 [ 1035.684149] RIP: 0010:[] [] dql_completed+0x180/0x190 [ 1035.684162] RSP: 0000:ffff8f5a75483e50 EFLAGS: 00010093 [ 1035.684162] RAX: 00000000000000c2 RBX: ffff8f5a6f91c000 RCX: 0000000000000000 [ 1035.684162] RDX: 0000000000000000 RSI: 0000000000000184 RDI: ffff8f599fea3ec0 [ 1035.684162] RBP: ffff8f5a75483ea8 R08: 00000000000000c2 R09: 0000000000000000 [ 1035.684162] R10: 00000000000616ef R11: ffff8f5a75483b56 R12: ffff8f599fea3e00 [ 1035.684162] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000184 [ 1035.684162] FS: 00007fa8434de740(0000) GS:ffff8f5a75480000(0000) knlGS:0000000000000000 [ 1035.684162] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1035.684162] CR2: 00000000004305d0 CR3: 000000024eb66000 CR4: 00000000001406e0 [ 1035.684162] Call 
Trace: [ 1035.684162] [ 1035.684162] [] ? cp_interrupt+0x478/0x580 [8139cp] [ 1035.684162] [] __handle_irq_event_percpu+0x44/0x1c0 [ 1035.684162] [] handle_irq_event_percpu+0x32/0x80 [ 1035.684162] [] handle_irq_event+0x3c/0x60 [ 1035.684162] [] handle_fasteoi_irq+0x59/0x110 [ 1035.684162] [] handle_irq+0xe4/0x1a0 [ 1035.684162] [] do_IRQ+0x4d/0xf0 [ 1035.684162] [] common_interrupt+0x162/0x162 [ 1035.684162] [ 1035.684162] [] ? __wake_up_bit+0x24/0x70 [ 1035.684162] [] ? do_set_pte+0xd5/0x120 [ 1035.684162] [] unlock_page+0x2b/0x30 [ 1035.684162] [] do_read_fault.isra.61+0x139/0x1b0 [ 1035.684162] [] handle_pte_fault+0x2f4/0xd10 [ 1035.684162] [] handle_mm_fault+0x39d/0x9b0 [ 1035.684162] [] __do_page_fault+0x203/0x500 [ 1035.684162] [] trace_do_page_fault+0x56/0x150 [ 1035.684162] [] do_async_page_fault+0x22/0xf0 [ 1035.684162] [] async_page_fault+0x28/0x30 [ 1035.684162] Code: 54 c7 47 54 ff ff ff ff 44 0f 49 ce 48 8b 35 48 2f 9c 00 48 89 77 58 e9 fe fe ff ff 0f 1f 80 00 00 00 00 41 89 d1 e9 ef fe ff ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 55 8d 42 ff 48 [ 1035.684162] RIP [] dql_completed+0x180/0x190 [ 1035.684162] RSP It's not the same as in 7fe0ee09 patch described. As 8139cp uses shared irq mode, other device irq will trigger cp_interrupt to execute. cp_change_mtu -> cp_close -> cp_open In cp_close routine just before free_irq(), some interrupt may occur. In my environment, cp_interrupt exectutes and IntrStatus is 0x4, exactly TxOk. That will cause cp_tx to wake device queue. As device queue is started, cp_start_xmit and cp_open will run at same time which will cause kernel BUG. For example: [#] for tx descriptor At start: [#][#][#] num_queued=3 After cp_init_hw->cp_start_hw->netdev_reset_queue: [#][#][#] num_queued=0 When 8139cp starts to work then cp_tx will check num_queued mismatchs the complete_bytes. The patch will check IntrMask before check IntrStatus in cp_interrupt. When 8139cp interrupt is disabled, just return. Signed-off-by: Su Yanjun Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 81045dfa1cd8..44f6e4873aad 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -571,6 +571,7 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance) struct cp_private *cp; int handled = 0; u16 status; + u16 mask; if (unlikely(dev == NULL)) return IRQ_NONE; @@ -578,6 +579,10 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance) spin_lock(&cp->lock); + mask = cpr16(IntrMask); + if (!mask) + goto out_unlock; + status = cpr16(IntrStatus); if (!status || (status == 0xFFFF)) goto out_unlock; From 4e4b08e55889da97dec750759f3ade8cc92b4644 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 3 Dec 2018 18:09:24 +0900 Subject: [PATCH 50/93] tun: remove skb access after netif_receive_skb In tun.c skb->len was accessed while doing stats accounting after a call to netif_receive_skb. We can not access skb after this call because buffers may be dropped. The fix for this bug would be to store skb->len in local variable and then use it after netif_receive_skb(). IMO using xdp data size for accounting bytes will be better because input for tun_xdp_one() is xdp_buff. 
Hence this patch: - fixes a bug by removing skb access after netif_receive_skb() - uses xdp data size for accounting bytes [613.019057] BUG: KASAN: use-after-free in tun_sendmsg+0x77c/0xc50 [tun] [613.021062] Read of size 4 at addr ffff8881da9ab7c0 by task vhost-1115/1155 [613.023073] [613.024003] CPU: 0 PID: 1155 Comm: vhost-1115 Not tainted 4.20.0-rc3-vm+ #232 [613.026029] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [613.029116] Call Trace: [613.031145] dump_stack+0x5b/0x90 [613.032219] print_address_description+0x6c/0x23c [613.034156] ? tun_sendmsg+0x77c/0xc50 [tun] [613.036141] kasan_report.cold.5+0x241/0x308 [613.038125] tun_sendmsg+0x77c/0xc50 [tun] [613.040109] ? tun_get_user+0x1960/0x1960 [tun] [613.042094] ? __isolate_free_page+0x270/0x270 [613.045173] vhost_tx_batch.isra.14+0xeb/0x1f0 [vhost_net] [613.047127] ? peek_head_len.part.13+0x90/0x90 [vhost_net] [613.049096] ? get_tx_bufs+0x5a/0x2c0 [vhost_net] [613.051106] ? vhost_enable_notify+0x2d8/0x420 [vhost] [613.053139] handle_tx_copy+0x2d0/0x8f0 [vhost_net] [613.053139] ? vhost_net_buf_peek+0x340/0x340 [vhost_net] [613.053139] ? __mutex_lock+0x8d9/0xb30 [613.053139] ? finish_task_switch+0x8f/0x3f0 [613.053139] ? handle_tx+0x32/0x120 [vhost_net] [613.053139] ? mutex_trylock+0x110/0x110 [613.053139] ? finish_task_switch+0xcf/0x3f0 [613.053139] ? finish_task_switch+0x240/0x3f0 [613.053139] ? __switch_to_asm+0x34/0x70 [613.053139] ? __switch_to_asm+0x40/0x70 [613.053139] ? __schedule+0x506/0xf10 [613.053139] handle_tx+0xc7/0x120 [vhost_net] [613.053139] vhost_worker+0x166/0x200 [vhost] [613.053139] ? vhost_dev_init+0x580/0x580 [vhost] [613.053139] ? __kthread_parkme+0x77/0x90 [613.053139] ? vhost_dev_init+0x580/0x580 [vhost] [613.053139] kthread+0x1b1/0x1d0 [613.053139] ? 
kthread_park+0xb0/0xb0 [613.053139] ret_from_fork+0x35/0x40 [613.088705] [613.088705] Allocated by task 1155: [613.088705] kasan_kmalloc+0xbf/0xe0 [613.088705] kmem_cache_alloc+0xdc/0x220 [613.088705] __build_skb+0x2a/0x160 [613.088705] build_skb+0x14/0xc0 [613.088705] tun_sendmsg+0x4f0/0xc50 [tun] [613.088705] vhost_tx_batch.isra.14+0xeb/0x1f0 [vhost_net] [613.088705] handle_tx_copy+0x2d0/0x8f0 [vhost_net] [613.088705] handle_tx+0xc7/0x120 [vhost_net] [613.088705] vhost_worker+0x166/0x200 [vhost] [613.088705] kthread+0x1b1/0x1d0 [613.088705] ret_from_fork+0x35/0x40 [613.088705] [613.088705] Freed by task 1155: [613.088705] __kasan_slab_free+0x12e/0x180 [613.088705] kmem_cache_free+0xa0/0x230 [613.088705] ip6_mc_input+0x40f/0x5a0 [613.088705] ipv6_rcv+0xc9/0x1e0 [613.088705] __netif_receive_skb_one_core+0xc1/0x100 [613.088705] netif_receive_skb_internal+0xc4/0x270 [613.088705] br_pass_frame_up+0x2b9/0x2e0 [613.088705] br_handle_frame_finish+0x2fb/0x7a0 [613.088705] br_handle_frame+0x30f/0x6c0 [613.088705] __netif_receive_skb_core+0x61a/0x15b0 [613.088705] __netif_receive_skb_one_core+0x8e/0x100 [613.088705] netif_receive_skb_internal+0xc4/0x270 [613.088705] tun_sendmsg+0x738/0xc50 [tun] [613.088705] vhost_tx_batch.isra.14+0xeb/0x1f0 [vhost_net] [613.088705] handle_tx_copy+0x2d0/0x8f0 [vhost_net] [613.088705] handle_tx+0xc7/0x120 [vhost_net] [613.088705] vhost_worker+0x166/0x200 [vhost] [613.088705] kthread+0x1b1/0x1d0 [613.088705] ret_from_fork+0x35/0x40 [613.088705] [613.088705] The buggy address belongs to the object at ffff8881da9ab740 [613.088705] which belongs to the cache skbuff_head_cache of size 232 Fixes: 043d222f93ab ("tuntap: accept an array of XDP buffs through sendmsg()") Reviewed-by: Toshiaki Makita Signed-off-by: Prashant Bhole Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index cf349e65a66b..005020042be9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2385,6 +2385,7 @@ static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush) { + unsigned int datasize = xdp->data_end - xdp->data; struct tun_xdp_hdr *hdr = xdp->data_hard_start; struct virtio_net_hdr *gso = &hdr->gso; struct tun_pcpu_stats *stats; @@ -2461,7 +2462,7 @@ build: stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); stats->rx_packets++; - stats->rx_bytes += skb->len; + stats->rx_bytes += datasize; u64_stats_update_end(&stats->syncp); put_cpu_ptr(stats); From 025dceb0fab31c912c41b8f32577432231d83e6b Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 3 Dec 2018 17:51:04 +0530 Subject: [PATCH 51/93] bpf: powerpc64: optimize JIT passes for bpf function calls Once the JITed images for each function in a multi-function program are generated after the first three JIT passes, we only need to fix the target address for the branch instruction corresponding to each bpf-to-bpf function call. This introduces the following optimizations for reducing the work done by the JIT compiler when handling multi-function programs: [1] Instead of doing two extra passes to fix the bpf function calls, do just one as that would be sufficient. [2] During the extra pass, only overwrite the instruction sequences for the bpf-to-bpf function calls as everything else would still remain exactly the same. This also reduces the number of writes to the JITed image. 
[3] Do not regenerate the prologue and the epilogue during the extra pass as that would be redundant. Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- arch/powerpc/net/bpf_jit_comp64.c | 66 +++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 17482f5de3e2..9393e231cbc2 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -891,6 +891,55 @@ cond_branch: return 0; } +/* Fix the branch target addresses for subprog calls */ +static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, + struct codegen_context *ctx, u32 *addrs) +{ + const struct bpf_insn *insn = fp->insnsi; + bool func_addr_fixed; + u64 func_addr; + u32 tmp_idx; + int i, ret; + + for (i = 0; i < fp->len; i++) { + /* + * During the extra pass, only the branch target addresses for + * the subprog calls need to be fixed. All other instructions + * can left untouched. + * + * The JITed image length does not change because we already + * ensure that the JITed instruction sequence for these calls + * are of fixed length by padding them with NOPs. + */ + if (insn[i].code == (BPF_JMP | BPF_CALL) && + insn[i].src_reg == BPF_PSEUDO_CALL) { + ret = bpf_jit_get_func_addr(fp, &insn[i], true, + &func_addr, + &func_addr_fixed); + if (ret < 0) + return ret; + + /* + * Save ctx->idx as this would currently point to the + * end of the JITed image and set it to the offset of + * the instruction sequence corresponding to the + * subprog call temporarily. + */ + tmp_idx = ctx->idx; + ctx->idx = addrs[i] / 4; + bpf_jit_emit_func_call_rel(image, ctx, func_addr); + + /* + * Restore ctx->idx here. This is safe as the length + * of the JITed sequence remains unchanged. + */ + ctx->idx = tmp_idx; + } + } + + return 0; +} + struct powerpc64_jit_data { struct bpf_binary_header *header; u32 *addrs; @@ -989,6 +1038,22 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) skip_init_ctx: code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); + if (extra_pass) { + /* + * Do not touch the prologue and epilogue as they will remain + * unchanged. Only fix the branch target address for subprog + * calls in the body. + * + * This does not change the offsets and lengths of the subprog + * call instruction sequences and hence, the size of the JITed + * image as well. + */ + bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs); + + /* There is no need to perform the usual passes. */ + goto skip_codegen_passes; + } + /* Code generation passes 1-2 */ for (pass = 1; pass < 3; pass++) { /* Now build the prologue, body code & epilogue for real. */ @@ -1002,6 +1067,7 @@ skip_init_ctx: proglen - (cgctx.idx * 4), cgctx.seen); } +skip_codegen_passes: if (bpf_jit_enable > 1) /* * Note that we output the base address of the code_base From ef1b5bf506b1f0ee3edc98533e1f3ecb105eb46a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 28 Nov 2018 09:02:41 +0000 Subject: [PATCH 52/93] net: phy: Fix not to call phy_resume() if PHY is not attached This patch fixes an issue that mdio_bus_phy_resume() doesn't call phy_resume() if the PHY is not attached. Fixes: 803dd9c77ac3 ("net: phy: avoid suspending twice a PHY") Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 18e92c19c5ab..c4b9008c52d2 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -220,7 +220,7 @@ static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); #ifdef CONFIG_PM -static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) +static bool mdio_bus_phy_may_suspend(struct phy_device *phydev, bool suspend) { struct device_driver *drv = phydev->mdio.dev.driver; struct phy_driver *phydrv = to_phy_driver(drv); @@ -232,10 +232,11 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) /* PHY not attached? May suspend if the PHY has not already been * suspended as part of a prior call to phy_disconnect() -> * phy_detach() -> phy_suspend() because the parent netdev might be the - * MDIO bus driver and clock gated at this point. + * MDIO bus driver and clock gated at this point. Also may resume if + * PHY is not attached. */ if (!netdev) - return !phydev->suspended; + return suspend ? !phydev->suspended : phydev->suspended; if (netdev->wol_enabled) return false; @@ -270,7 +271,7 @@ static int mdio_bus_phy_suspend(struct device *dev) if (phydev->attached_dev && phydev->adjust_link) phy_stop_machine(phydev); - if (!mdio_bus_phy_may_suspend(phydev)) + if (!mdio_bus_phy_may_suspend(phydev, true)) return 0; return phy_suspend(phydev); @@ -281,7 +282,7 @@ static int mdio_bus_phy_resume(struct device *dev) struct phy_device *phydev = to_phy_device(dev); int ret; - if (!mdio_bus_phy_may_suspend(phydev)) + if (!mdio_bus_phy_may_suspend(phydev, false)) goto no_resume; ret = phy_resume(phydev); From 8c85f4b81296a530b8af2796c110fa482ac42d4f Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 28 Nov 2018 09:02:42 +0000 Subject: [PATCH 53/93] net: phy: micrel: add toggling phy reset if PHY is not attached This patch adds toggling phy reset if PHY is not attached. Otherwise, some boards (e.g. R-Car H3 Salvator-XS) cannot link up correctly if we do the following method: 1) Kernel boots by using initramfs. --> No open the nic, so phy_device_register() and phy_probe() deasserts the reset. 2) Kernel enters the suspend. --> So, keep the reset signal as deassert. --> On R-Car Salvator-XS board, unfortunately, the board power is turned off. 3) Kernel returns from suspend. 4) ifconfig eth0 up --> Then, since edge signal of the reset doesn't happen, it cannot link up. 5) ifconfig eth0 down 6) ifconfig eth0 up --> In this case, it can link up. Reported-by: Hiromitsu Yamasaki Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. 
Miller --- drivers/net/phy/micrel.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 9265dea79412..1679a6ea104c 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -23,6 +23,7 @@ * ksz9477 */ +#include #include #include #include @@ -835,6 +836,13 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; + if (!phydev->attached_dev) { + /* If the PHY is not attached, toggle the reset */ + phy_device_reset(phydev, 1); + udelay(1); + phy_device_reset(phydev, 0); + } + genphy_resume(phydev); ret = kszphy_config_reset(phydev); From e3f787189e10f5fafce77ba8aa948741ebb93c2b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 30 Nov 2018 16:05:53 +0000 Subject: [PATCH 54/93] vhost: fix IOTLB locking Commit 78139c94dc8c ("net: vhost: lock the vqs one by one") moved the vq lock to improve scalability, but introduced a possible deadlock in vhost-iotlb. vhost_iotlb_notify_vq() now takes vq->mutex while holding the device's IOTLB spinlock. And on the vhost_iotlb_miss() path, the spinlock is taken while holding vq->mutex. Since calling vhost_poll_queue() doesn't require any lock, avoid the deadlock by not taking vq->mutex. Fixes: 78139c94dc8c ("net: vhost: lock the vqs one by one") Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: Jean-Philippe Brucker Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 3a5f81a66d34..6b98d8e3a5bf 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -944,10 +944,7 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d, if (msg->iova <= vq_msg->iova && msg->iova + msg->size - 1 >= vq_msg->iova && vq_msg->type == VHOST_IOTLB_MISS) { - mutex_lock(&node->vq->mutex); vhost_poll_queue(&node->vq->poll); - mutex_unlock(&node->vq->mutex); - list_del(&node->node); kfree(node); } From 986103e7920cabc0b910749e77ae5589d3934d52 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 30 Nov 2018 10:59:08 -0600 Subject: [PATCH 55/93] net/ibmvnic: Fix RTNL deadlock during device reset Commit a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") made the change to hold the RTNL lock during driver reset but still calls netdev_notify_peers, which results in a deadlock. Instead, use call_netdevice_notifiers, which is functionally the same except that it does not take the RTNL lock again. Fixes: a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c0203a0d5e3b..ed50b8dee44f 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1859,7 +1859,7 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (adapter->reset_reason != VNIC_RESET_FAILOVER && adapter->reset_reason != VNIC_RESET_CHANGE_PARAM) - netdev_notify_peers(netdev); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, netdev); netif_carrier_on(netdev); From fb6df5a6234c38a9c551559506a49a677ac6f07a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 Dec 2018 01:36:59 +0800 Subject: [PATCH 56/93] sctp: kfree_rcu asoc In sctp_hash_transport/sctp_epaddr_lookup_transport, it dereferences a transport's asoc under rcu_read_lock while asoc is freed not after a grace period, which leads to a use-after-free panic. 
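[illustrative sketch, not part of the patch: the generic pattern for making such rcu_read_lock() lookups safe is to embed an rcu_head in the object and defer its release past a grace period instead of freeing it immediately]

	struct foo {
		/* ... fields read under rcu_read_lock() ... */
		struct rcu_head rcu;
	};

	/* instead of kfree(f): */
	kfree_rcu(f, rcu);
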
This patch fixes it by calling kfree_rcu to make asoc be freed after a grace period. Note that only the asoc's memory is delayed to free in the patch, it won't cause sk to linger longer. Thanks Neil and Marcelo to make this clear. Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport rhashtable") Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new transport") Reported-by: syzbot+0b05d8aa7cb185107483@syzkaller.appspotmail.com Reported-by: syzbot+aad231d51b1923158444@syzkaller.appspotmail.com Suggested-by: Neil Horman Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 ++ net/sctp/associola.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a11f93790476..feada358d872 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2075,6 +2075,8 @@ struct sctp_association { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + + struct rcu_head rcu; }; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dd77ec3892b6..914750b819b2 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -435,7 +435,7 @@ static void sctp_association_destroy(struct sctp_association *asoc) WARN_ON(atomic_read(&asoc->rmem_alloc)); - kfree(asoc); + kfree_rcu(asoc, rcu); SCTP_DBG_OBJCNT_DEC(assoc); } From 59f997b088d26a774958cb7b17b0763cd82de7ec Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Sat, 1 Dec 2018 00:26:27 +0100 Subject: [PATCH 57/93] macvlan: return correct error value A MAC address must be unique among all the macvlan devices with the same lower device. The only exception is the passthru [sic] mode, which shares the lower device address. When duplicate addresses are detected, EBUSY is returned when bringing the interface up: # ip link add macvlan0 link eth0 type macvlan # read addr Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index fc8d5f1ee1ad..0da3d36b283b 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -608,7 +608,7 @@ static int macvlan_open(struct net_device *dev) goto hash_add; } - err = -EBUSY; + err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; @@ -706,7 +706,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) - return -EBUSY; + return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); @@ -747,6 +747,9 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p) return dev_set_mac_address(vlan->lowerdev, addr); } + if (macvlan_addr_busy(vlan->port, addr->sa_data)) + return -EADDRINUSE; + return macvlan_sync_address(dev, addr->sa_data); } From a74515604a7b171f2702bdcbd1e231225fb456d0 Mon Sep 17 00:00:00 2001 From: Anderson Luiz Alves Date: Fri, 30 Nov 2018 21:58:36 -0200 Subject: [PATCH 58/93] mv88e6060: disable hardware level MAC learning Disable hardware level MAC learning because it breaks station roaming. When enabled it drops all frames that arrive from a MAC address that is on a different port at learning table. Signed-off-by: Anderson Luiz Alves Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6060.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 65f10fec25b3..0b3e51f248c2 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -116,8 +116,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds) /* Reset the switch. */ REG_WRITE(REG_GLOBAL, GLOBAL_ATU_CONTROL, GLOBAL_ATU_CONTROL_SWRESET | - GLOBAL_ATU_CONTROL_ATUSIZE_1024 | - GLOBAL_ATU_CONTROL_ATE_AGE_5MIN); + GLOBAL_ATU_CONTROL_LEARNDIS); /* Wait up to one second for reset to complete. */ timeout = jiffies + 1 * HZ; @@ -142,13 +141,10 @@ static int mv88e6060_setup_global(struct dsa_switch *ds) */ REG_WRITE(REG_GLOBAL, GLOBAL_CONTROL, GLOBAL_CONTROL_MAX_FRAME_1536); - /* Enable automatic address learning, set the address - * database size to 1024 entries, and set the default aging - * time to 5 minutes. + /* Disable automatic address learning. */ REG_WRITE(REG_GLOBAL, GLOBAL_ATU_CONTROL, - GLOBAL_ATU_CONTROL_ATUSIZE_1024 | - GLOBAL_ATU_CONTROL_ATE_AGE_5MIN); + GLOBAL_ATU_CONTROL_LEARNDIS); return 0; } From bf29e9e9b6d2f09cdbf39b48d028f0b49e944f85 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 1 Dec 2018 21:11:19 -0500 Subject: [PATCH 59/93] net/core: tidy up an error message netif_napi_add() could report an error like this below due to it allows to pass a format string for wildcarding before calling dev_get_valid_name(), "netif_napi_add() called with weight 256 on device eth%d" For example, hns_enet_drv module does this. hns_nic_try_get_ae hns_nic_init_ring_data netif_napi_add register_netdev dev_get_valid_name Hence, make it a bit more human-readable by using netdev_err_once() instead. Signed-off-by: Qian Cai Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 3470e7fff1f4..e06223b65674 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6209,8 +6209,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) - pr_err_once("netif_napi_add() called with weight %d on device %s\n", - weight, dev->name); + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; From 24be19e47779d604d1492c114459dca9a92acf78 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 2 Dec 2018 14:34:36 +0200 Subject: [PATCH 60/93] net/mlx4_en: Change min MTU size to ETH_MIN_MTU NIC driver minimal MTU size shall be set to ETH_MIN_MTU, as defined in the RFC791 and in the network stack. Remove old mlx4_en only define for it, which was set to wrong value. Fixes: b80f71f5816f ("ethernet/mellanox: use core min/max MTU checking") Signed-off-by: Eran Ben Elisha Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index b744cd49a785..6b88881b8e35 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -3493,8 +3493,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; } - /* MTU range: 46 - hw-specific max */ - dev->min_mtu = MLX4_EN_MIN_MTU; + /* MTU range: 68 - hw-specific max */ + dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = priv->max_mtu; mdev->pndev[port] = dev; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 485d856546c6..8137454e2534 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -161,7 +161,6 @@ #define MLX4_SELFTEST_LB_MIN_MTU (MLX4_LOOPBACK_TEST_PAYLOAD + NET_IP_ALIGN + \ ETH_HLEN + PREAMBLE_LEN) -#define MLX4_EN_MIN_MTU 46 /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple * headers. (For example: ETH_P_8021Q and ETH_P_8021AD). */ From 1b603f9e4313348608f256b564ed6e3d9e67f377 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 2 Dec 2018 14:34:37 +0200 Subject: [PATCH 61/93] net/mlx4_en: Fix build break when CONFIG_INET is off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MLX4_EN depends on NETDEVICES, ETHERNET and INET Kconfigs. Make sure they are listed in MLX4_EN Kconfig dependencies. This fixes the following build break: drivers/net/ethernet/mellanox/mlx4/en_rx.c:582:18: warning: ‘struct iphdr’ declared inside parameter list [enabled by default] struct iphdr *iph) ^ drivers/net/ethernet/mellanox/mlx4/en_rx.c:582:18: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] drivers/net/ethernet/mellanox/mlx4/en_rx.c: In function ‘get_fixed_ipv4_csum’: drivers/net/ethernet/mellanox/mlx4/en_rx.c:586:20: error: dereferencing pointer to incomplete type _u8 ipproto = iph->protocol; Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig index 36054e6fb9d3..f200b8c420d5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig @@ -5,7 +5,7 @@ config MLX4_EN tristate "Mellanox Technologies 1/10/40Gbit Ethernet support" depends on MAY_USE_DEVLINK - depends on PCI + depends on PCI && NETDEVICES && ETHERNET && INET select MLX4_CORE imply PTP_1588_CLOCK ---help--- From c3494801cd1785e2c25f1a5735fa19ddcf9665da Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Dec 2018 22:46:04 -0800 Subject: [PATCH 62/93] bpf: check pending signals while verifying programs Malicious user space may try to force the verifier to use as much cpu time and memory as possible. Hence check for pending signals while verifying the program. Note that suspend of sys_bpf(PROG_LOAD) syscall will lead to EAGAIN, since the kernel has to release the resources used for program verification. 
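[illustrative sketch, not part of the patch: a user-space loader that can be interrupted may simply retry the load on EAGAIN; attr is assumed to be an already prepared union bpf_attr]

	/* needs <unistd.h>, <errno.h>, <sys/syscall.h>, <linux/bpf.h> */
	int fd;

	do {
		fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	} while (fd < 0 && errno == EAGAIN);
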
Reported-by: Anatoly Trosinenko Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6dd419550aba..751bb30b7c5c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5148,6 +5148,9 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } + if (signal_pending(current)) + return -EAGAIN; + if (need_resched()) cond_resched(); From 4f7b3e82589e0de723780198ec7983e427144c0a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Dec 2018 22:46:05 -0800 Subject: [PATCH 63/93] bpf: improve verifier branch analysis pathological bpf programs may try to force verifier to explode in the number of branch states: 20: (d5) if r1 s<= 0x24000028 goto pc+0 21: (b5) if r0 <= 0xe1fa20 goto pc+2 22: (d5) if r1 s<= 0x7e goto pc+0 23: (b5) if r0 <= 0xe880e000 goto pc+0 24: (c5) if r0 s< 0x2100ecf4 goto pc+0 25: (d5) if r1 s<= 0xe880e000 goto pc+1 26: (c5) if r0 s< 0xf4041810 goto pc+0 27: (d5) if r1 s<= 0x1e007e goto pc+0 28: (b5) if r0 <= 0xe86be000 goto pc+0 29: (07) r0 += 16614 30: (c5) if r0 s< 0x6d0020da goto pc+0 31: (35) if r0 >= 0x2100ecf4 goto pc+0 Teach verifier to recognize always taken and always not taken branches. This analysis is already done for == and != comparison. Expand it to all other branches. It also helps real bpf programs to be verified faster: before after bpf_lb-DLB_L3.o 2003 1940 bpf_lb-DLB_L4.o 3173 3089 bpf_lb-DUNKNOWN.o 1080 1065 bpf_lxc-DDROP_ALL.o 29584 28052 bpf_lxc-DUNKNOWN.o 36916 35487 bpf_netdev.o 11188 10864 bpf_overlay.o 6679 6643 bpf_lcx_jit.o 39555 38437 Reported-by: Anatoly Trosinenko Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 93 ++++++++++++++++++--- tools/testing/selftests/bpf/test_verifier.c | 4 +- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 751bb30b7c5c..55a49703f423 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3751,6 +3751,79 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, } } +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. 
Example: "if (reg < 5)" is unknown when register value range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ + if (__is_pointer_value(false, reg)) + return -1; + + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(reg->var_off)) + return !!tnum_equals_const(reg->var_off, val); + break; + case BPF_JNE: + if (tnum_is_const(reg->var_off)) + return !tnum_equals_const(reg->var_off, val); + break; + case BPF_JGT: + if (reg->umin_value > val) + return 1; + else if (reg->umax_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->smin_value > (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLT: + if (reg->umax_value < val) + return 1; + else if (reg->umin_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->smax_value < (s64)val) + return 1; + else if (reg->smin_value >= (s64)val) + return 0; + break; + case BPF_JGE: + if (reg->umin_value >= val) + return 1; + else if (reg->umax_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->smin_value >= (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLE: + if (reg->umax_value <= val) + return 1; + else if (reg->umin_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->smax_value <= (s64)val) + return 1; + else if (reg->smin_value > (s64)val) + return 0; + break; + } + + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -4152,21 +4225,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; - /* detect if R == 0 where R was initialized to zero earlier */ - if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == SCALAR_VALUE && - tnum_is_const(dst_reg->var_off)) { - if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || - (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { - /* if (imm == imm) goto pc+off; - * only follow the goto, ignore fall-through - */ + if (BPF_SRC(insn->code) == BPF_K) { + int pred = is_branch_taken(dst_reg, insn->imm, opcode); + + if (pred == 1) { + /* only follow the goto, ignore fall-through */ *insn_idx += insn->off; return 0; - } else { - /* if (imm != imm) goto pc+off; - * only follow fall-through branch, since + } else if (pred == 0) { + /* only follow fall-through branch, since * that's where the program will go */ return 0; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 5dd4410a716c..df6f751cc1e8 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8576,7 +8576,7 @@ static struct bpf_test tests[] = { BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, .fixup_map_hash_8b = { 4 }, - .errstr = "R0 invalid mem access 'inv'", + .errstr = "unbounded min value", .result = REJECT, }, { @@ -10547,7 +10547,7 @@ static struct bpf_test tests[] = { "check deducing bounds from const, 5", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, From ceefbc96fa5c5b975d87bf8e89ba8416f6b764d9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Dec 2018 22:46:06 -0800 Subject: [PATCH 64/93] bpf: add per-insn complexity limit malicious bpf program may try to force the verifier to 
remember a lot of distinct verifier states. Put a limit to number of per-insn 'struct bpf_verifier_state'. Note that hitting the limit doesn't reject the program. It potentially makes the verifier do more steps to analyze the program. It means that malicious programs will hit BPF_COMPLEXITY_LIMIT_INSNS sooner instead of spending cpu time walking long link list. The limit of BPF_COMPLEXITY_LIMIT_STATES==64 affects cilium progs with slight increase in number of "steps" it takes to successfully verify the programs: before after bpf_lb-DLB_L3.o 1940 1940 bpf_lb-DLB_L4.o 3089 3089 bpf_lb-DUNKNOWN.o 1065 1065 bpf_lxc-DDROP_ALL.o 28052 | 28162 bpf_lxc-DUNKNOWN.o 35487 | 35541 bpf_netdev.o 10864 10864 bpf_overlay.o 6643 6643 bpf_lcx_jit.o 38437 38437 But it also makes malicious program to be rejected in 0.4 seconds vs 6.5 Hence apply this limit to unprivileged programs only. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 55a49703f423..fc760d00a38c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -175,6 +175,7 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ @@ -5047,7 +5048,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; - int i, j, err; + int i, j, err, states_cnt = 0; sl = env->explored_states[insn_idx]; if (!sl) @@ -5074,8 +5075,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 1; } sl = sl->next; + states_cnt++; } + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + return 0; + /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) From 7b566f70e1bf65b189b66eb3de6f431c30f7dff2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 4 Dec 2018 08:47:44 -0800 Subject: [PATCH 65/93] phy: Revert toggling reset changes. This reverts: ef1b5bf506b1 ("net: phy: Fix not to call phy_resume() if PHY is not attached") 8c85f4b81296 ("net: phy: micrel: add toggling phy reset if PHY is not attached") Andrew Lunn informs me that there are alternative efforts underway to fix this more properly. Signed-off-by: David S. 
Miller --- drivers/net/phy/micrel.c | 8 -------- drivers/net/phy/phy_device.c | 11 +++++------ 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 1679a6ea104c..9265dea79412 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -23,7 +23,6 @@ * ksz9477 */ -#include #include #include #include @@ -836,13 +835,6 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; - if (!phydev->attached_dev) { - /* If the PHY is not attached, toggle the reset */ - phy_device_reset(phydev, 1); - udelay(1); - phy_device_reset(phydev, 0); - } - genphy_resume(phydev); ret = kszphy_config_reset(phydev); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c4b9008c52d2..18e92c19c5ab 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -220,7 +220,7 @@ static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); #ifdef CONFIG_PM -static bool mdio_bus_phy_may_suspend(struct phy_device *phydev, bool suspend) +static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->mdio.dev.driver; struct phy_driver *phydrv = to_phy_driver(drv); @@ -232,11 +232,10 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev, bool suspend) /* PHY not attached? May suspend if the PHY has not already been * suspended as part of a prior call to phy_disconnect() -> * phy_detach() -> phy_suspend() because the parent netdev might be the - * MDIO bus driver and clock gated at this point. Also may resume if - * PHY is not attached. + * MDIO bus driver and clock gated at this point. */ if (!netdev) - return suspend ? !phydev->suspended : phydev->suspended; + return !phydev->suspended; if (netdev->wol_enabled) return false; @@ -271,7 +270,7 @@ static int mdio_bus_phy_suspend(struct device *dev) if (phydev->attached_dev && phydev->adjust_link) phy_stop_machine(phydev); - if (!mdio_bus_phy_may_suspend(phydev, true)) + if (!mdio_bus_phy_may_suspend(phydev)) return 0; return phy_suspend(phydev); @@ -282,7 +281,7 @@ static int mdio_bus_phy_resume(struct device *dev) struct phy_device *phydev = to_phy_device(dev); int ret; - if (!mdio_bus_phy_may_suspend(phydev, false)) + if (!mdio_bus_phy_may_suspend(phydev)) goto no_resume; ret = phy_resume(phydev); From 688838934c231bb08f46db687e57f6d8bf82709c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Dec 2018 09:40:35 -0800 Subject: [PATCH 66/93] rtnetlink: ndo_dflt_fdb_dump() only work for ARPHRD_ETHER devices kmsan was able to trigger a kernel-infoleak using a gre device [1] nlmsg_populate_fdb_fill() has a hard coded assumption that dev->addr_len is ETH_ALEN, as normally guaranteed for ARPHRD_ETHER devices. 
A similar issue was fixed recently in commit da71577545a5 ("rtnetlink: Disallow FDB configuration for non-Ethernet device") [1] BUG: KMSAN: kernel-infoleak in copyout lib/iov_iter.c:143 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x4c0/0x2700 lib/iov_iter.c:576 CPU: 0 PID: 6697 Comm: syz-executor310 Not tainted 4.20.0-rc3+ #95 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x32d/0x480 lib/dump_stack.c:113 kmsan_report+0x12c/0x290 mm/kmsan/kmsan.c:683 kmsan_internal_check_memory+0x32a/0xa50 mm/kmsan/kmsan.c:743 kmsan_copy_to_user+0x78/0xd0 mm/kmsan/kmsan_hooks.c:634 copyout lib/iov_iter.c:143 [inline] _copy_to_iter+0x4c0/0x2700 lib/iov_iter.c:576 copy_to_iter include/linux/uio.h:143 [inline] skb_copy_datagram_iter+0x4e2/0x1070 net/core/datagram.c:431 skb_copy_datagram_msg include/linux/skbuff.h:3316 [inline] netlink_recvmsg+0x6f9/0x19d0 net/netlink/af_netlink.c:1975 sock_recvmsg_nosec net/socket.c:794 [inline] sock_recvmsg+0x1d1/0x230 net/socket.c:801 ___sys_recvmsg+0x444/0xae0 net/socket.c:2278 __sys_recvmsg net/socket.c:2327 [inline] __do_sys_recvmsg net/socket.c:2337 [inline] __se_sys_recvmsg+0x2fa/0x450 net/socket.c:2334 __x64_sys_recvmsg+0x4a/0x70 net/socket.c:2334 do_syscall_64+0xcf/0x110 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x441119 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fffc7f008a8 EFLAGS: 00000207 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000441119 RDX: 0000000000000040 RSI: 00000000200005c0 RDI: 0000000000000003 RBP: 00000000006cc018 R08: 0000000000000100 R09: 0000000000000100 R10: 0000000000000100 R11: 0000000000000207 R12: 0000000000402080 R13: 0000000000402110 R14: 0000000000000000 R15: 0000000000000000 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:246 [inline] kmsan_save_stack mm/kmsan/kmsan.c:261 [inline] kmsan_internal_chain_origin+0x13d/0x240 mm/kmsan/kmsan.c:469 kmsan_memcpy_memmove_metadata+0x1a9/0xf70 mm/kmsan/kmsan.c:344 kmsan_memcpy_metadata+0xb/0x10 mm/kmsan/kmsan.c:362 __msan_memcpy+0x61/0x70 mm/kmsan/kmsan_instr.c:162 __nla_put lib/nlattr.c:744 [inline] nla_put+0x20a/0x2d0 lib/nlattr.c:802 nlmsg_populate_fdb_fill+0x444/0x810 net/core/rtnetlink.c:3466 nlmsg_populate_fdb net/core/rtnetlink.c:3775 [inline] ndo_dflt_fdb_dump+0x73a/0x960 net/core/rtnetlink.c:3807 rtnl_fdb_dump+0x1318/0x1cb0 net/core/rtnetlink.c:3979 netlink_dump+0xc79/0x1c90 net/netlink/af_netlink.c:2244 __netlink_dump_start+0x10c4/0x11d0 net/netlink/af_netlink.c:2352 netlink_dump_start include/linux/netlink.h:216 [inline] rtnetlink_rcv_msg+0x141b/0x1540 net/core/rtnetlink.c:4910 netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4965 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x1699/0x1740 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x13c7/0x1440 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe3b/0x1240 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xcf/0x110 
arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:246 [inline] kmsan_internal_poison_shadow+0x6d/0x130 mm/kmsan/kmsan.c:170 kmsan_kmalloc+0xa1/0x100 mm/kmsan/kmsan_hooks.c:186 __kmalloc+0x14c/0x4d0 mm/slub.c:3825 kmalloc include/linux/slab.h:551 [inline] __hw_addr_create_ex net/core/dev_addr_lists.c:34 [inline] __hw_addr_add_ex net/core/dev_addr_lists.c:80 [inline] __dev_mc_add+0x357/0x8a0 net/core/dev_addr_lists.c:670 dev_mc_add+0x6d/0x80 net/core/dev_addr_lists.c:687 ip_mc_filter_add net/ipv4/igmp.c:1128 [inline] igmp_group_added+0x4d4/0xb80 net/ipv4/igmp.c:1311 __ip_mc_inc_group+0xea9/0xf70 net/ipv4/igmp.c:1444 ip_mc_inc_group net/ipv4/igmp.c:1453 [inline] ip_mc_up+0x1c3/0x400 net/ipv4/igmp.c:1775 inetdev_event+0x1d03/0x1d80 net/ipv4/devinet.c:1522 notifier_call_chain kernel/notifier.c:93 [inline] __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x13d/0x240 kernel/notifier.c:401 __dev_notify_flags+0x3da/0x860 net/core/dev.c:1733 dev_change_flags+0x1ac/0x230 net/core/dev.c:7569 do_setlink+0x165f/0x5ea0 net/core/rtnetlink.c:2492 rtnl_newlink+0x2ad7/0x35a0 net/core/rtnetlink.c:3111 rtnetlink_rcv_msg+0x1148/0x1540 net/core/rtnetlink.c:4947 netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4965 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x1699/0x1740 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x13c7/0x1440 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe3b/0x1240 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xcf/0x110 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Bytes 36-37 of 105 are uninitialized Memory access of size 105 starts at ffff88819686c000 Data copied to user address 0000000020000380 Fixes: d83b06036048 ("net: add fdb generic dump routine") Signed-off-by: Eric Dumazet Cc: John Fastabend Cc: Ido Schimmel Cc: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 33d9227a8b80..7819f7804eeb 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3800,6 +3800,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, { int err; + if (dev->type != ARPHRD_ETHER) + return -EINVAL; + netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) From ecb239d96d369c23c33d41708646df646de669f4 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 3 Dec 2018 13:21:01 +0100 Subject: [PATCH 67/93] ethernet: fman: fix wrong of_node_put() in probe function After getting a reference to the platform device's of_node the probe function ends up calling of_find_matching_node() using the node as an argument. The function takes care of decreasing the refcount on it. We are then incorrectly decreasing the refcount on that node again. This patch removes the unwarranted call to of_node_put(). Fixes: 414fd46e7762 ("fsl/fman: Add FMan support") Signed-off-by: Nicolas Saenz Julienne Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/fman/fman.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index c415ac67cb7b..e80fedb27cee 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -2786,7 +2786,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev) if (!muram_node) { dev_err(&of_dev->dev, "%s: could not find MURAM node\n", __func__); - goto fman_node_put; + goto fman_free; } err = of_address_to_resource(muram_node, 0, @@ -2795,11 +2795,10 @@ static struct fman *read_dts_node(struct platform_device *of_dev) of_node_put(muram_node); dev_err(&of_dev->dev, "%s: of_address_to_resource() = %d\n", __func__, err); - goto fman_node_put; + goto fman_free; } of_node_put(muram_node); - of_node_put(fm_node); err = devm_request_irq(&of_dev->dev, irq, fman_irq, IRQF_SHARED, "fman", fman); From 01b3fd5ac97caffb8e5d5bd85086da33db3b361f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 4 Dec 2018 16:03:52 +0200 Subject: [PATCH 68/93] net: mvpp2: fix detection of 10G SFP modules The mvpp2_phylink_validate() relies on the interface field of phylink_link_state to determine valid link modes. However, when called from phylink_sfp_module_insert() this field in not initialized. The default switch case then excludes 10G link modes. This allows 10G SFP modules that are detected correctly to be configured at max rate of 2.5G. Catch the uninitialized PHY mode case, and allow 10G rates. Fixes: d97c9f4ab000b ("net: mvpp2: 1000baseX support") Cc: Maxime Chevallier Cc: Antoine Tenart Acked-by: Russell King Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 7a37a37e3fb3..eb1dc8abc359 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4384,6 +4384,7 @@ static void mvpp2_phylink_validate(struct net_device *dev, switch (state->interface) { case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_NA: phylink_set(mask, 10000baseCR_Full); phylink_set(mask, 10000baseSR_Full); phylink_set(mask, 10000baseLR_Full); From 0fb628f0f250c74b1023edd0ca4a57c8b35b9b2c Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 4 Dec 2018 16:03:53 +0200 Subject: [PATCH 69/93] net: mvpp2: fix phylink handling of invalid PHY modes The .validate phylink callback should empty the supported bitmap when the interface mode is invalid. Cc: Maxime Chevallier Cc: Antoine Tenart Reported-by: Russell King Signed-off-by: Baruch Siach Signed-off-by: David S. 
Miller --- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index eb1dc8abc359..125ea99418df 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4375,8 +4375,27 @@ static void mvpp2_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) { + struct mvpp2_port *port = netdev_priv(dev); __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; + /* Invalid combinations */ + switch (state->interface) { + case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_XAUI: + if (port->gop_id != 0) + goto empty_set; + break; + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + if (port->gop_id == 0) + goto empty_set; + break; + default: + break; + } + phylink_set(mask, Autoneg); phylink_set_port_modes(mask); phylink_set(mask, Pause); @@ -4384,6 +4403,7 @@ static void mvpp2_phylink_validate(struct net_device *dev, switch (state->interface) { case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_XAUI: case PHY_INTERFACE_MODE_NA: phylink_set(mask, 10000baseCR_Full); phylink_set(mask, 10000baseSR_Full); @@ -4392,7 +4412,11 @@ static void mvpp2_phylink_validate(struct net_device *dev, phylink_set(mask, 10000baseER_Full); phylink_set(mask, 10000baseKR_Full); /* Fall-through */ - default: + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + case PHY_INTERFACE_MODE_SGMII: phylink_set(mask, 10baseT_Half); phylink_set(mask, 10baseT_Full); phylink_set(mask, 100baseT_Half); @@ -4404,11 +4428,18 @@ static void mvpp2_phylink_validate(struct net_device *dev, phylink_set(mask, 1000baseT_Full); phylink_set(mask, 1000baseX_Full); phylink_set(mask, 2500baseX_Full); + break; + default: + goto empty_set; } bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_and(state->advertising, state->advertising, mask, __ETHTOOL_LINK_MODE_MASK_NBITS); + return; + +empty_set: + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); } static void mvpp22_xlg_link_state(struct mvpp2_port *port, From a317e65face482371de30246b6494feb093ff7f9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 13 Nov 2018 20:32:13 +0100 Subject: [PATCH 70/93] mac80211: ignore tx status for PS stations in ieee80211_tx_status_ext Make it behave like regular ieee80211_tx_status calls, except for the lack of filtered frame processing. This fixes spurious low-ack triggered disconnections with powersave clients connected to an AP. 
Fixes: f027c2aca0cf4 ("mac80211: add ieee80211_tx_status_noskb") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/status.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index aa4afbf0abaf..a794ca729000 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -964,6 +964,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, /* Track when last TDLS packet was ACKed */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) sta->status_stats.last_tdls_pkt_time = jiffies; + } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + return; } else { ieee80211_lost_packet(sta, info); } From 9ec1190d065998650fd9260dea8cf3e1f56c0e8c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 28 Nov 2018 22:39:16 +0100 Subject: [PATCH 71/93] mac80211: fix reordering of buffered broadcast packets If the buffered broadcast queue contains packets, letting new packets bypass that queue can lead to heavy reordering, since the driver is probably throttling transmission of buffered multicast packets after beacons. Keep buffering packets until the buffer has been cleared (and no client is in powersave mode). Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e0ccee23fbcd..1f536ba573b4 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -439,8 +439,8 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; - /* no stations in PS mode */ - if (!atomic_read(&ps->num_sta_ps)) + /* no stations in PS mode and no buffered packets */ + if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf)) return TX_CONTINUE; info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; From 990d71846a0b7281bd933c34d734e6afc7408e7e Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 3 Dec 2018 21:16:07 +0200 Subject: [PATCH 72/93] mac80211: ignore NullFunc frames in the duplicate detection NullFunc packets should never be considered duplicates, just like QoS-NullFunc packets. We saw a client that enters / exits power save with NullFunc frames (and not with QoS-NullFunc) despite the fact that the association supports HT. This specific client also re-uses a non-zero sequence number for different NullFunc frames. At some point, the client had to send a retransmission of the NullFunc frame and we dropped it, leading to a misalignment in the power save state. Fix this by never considering a NullFunc frame as a duplicate, just like we do for QoS NullFunc frames. 
This fixes https://bugzilla.kernel.org/show_bug.cgi?id=201449 CC: Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a69ecfb212ed..428f7ad5f9b5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1403,6 +1403,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (ieee80211_is_ctl(hdr->frame_control) || + ieee80211_is_nullfunc(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; From 312ca38ddda64bac6513ec68e0ac3789b4eb44dc Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 5 Dec 2018 12:55:54 +0200 Subject: [PATCH 73/93] cfg80211: Fix busy loop regression in ieee80211_ie_split_ric() This function was modified to support the information element extension case (WLAN_EID_EXTENSION) in a manner that would result in an infinite loop when going through set of IEs that include WLAN_EID_RIC_DATA and contain an IE that is in the after_ric array. The only place where this can currently happen is in mac80211 ieee80211_send_assoc() where ieee80211_ie_split_ric() is called with after_ric[]. This can be triggered by valid data from user space nl80211 association/connect request (i.e., requiring GENL_UNS_ADMIN_PERM). The only known application having an option to include WLAN_EID_RIC_DATA in these requests is wpa_supplicant and it had a bug that prevented this specific contents from being used (and because of that, not triggering this kernel bug in an automated test case ap_ft_ric) and now that this bug is fixed, it has a workaround to avoid this kernel issue. WLAN_EID_RIC_DATA is currently used only for testing purposes, so this does not cause significant harm for production use cases. Fixes: 2512b1b18d07 ("mac80211: extend ieee80211_ie_split to support EXTENSION") Cc: stable@vger.kernel.org Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index ef14d80ca03e..d473bd135da8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1421,6 +1421,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); + else + break; } } else { pos = skip_ie(ies, ielen, pos); From 22f6bbb7bcfcef0b373b0502a7ff390275c575dd Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 4 Dec 2018 17:37:57 +0000 Subject: [PATCH 74/93] net: use skb_list_del_init() to remove from RX sublists list_del() leaves the skb->next pointer poisoned, which can then lead to a crash in e.g. OVS forwarding. For example, setting up an OVS VXLAN forwarding bridge on sfc as per: ======== $ ovs-vsctl show 5dfd9c47-f04b-4aaa-aa96-4fbb0a522a30 Bridge "br0" Port "br0" Interface "br0" type: internal Port "enp6s0f0" Interface "enp6s0f0" Port "vxlan0" Interface "vxlan0" type: vxlan options: {key="1", local_ip="10.0.0.5", remote_ip="10.0.0.4"} ovs_version: "2.5.0" ======== (where 10.0.0.5 is an address on enp6s0f1) and sending traffic across it will lead to the following panic: ======== general protection fault: 0000 [#1] SMP PTI CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.20.0-rc3-ehc+ #701 Hardware name: Dell Inc. 
PowerEdge R710/0M233H, BIOS 6.4.0 07/23/2013 RIP: 0010:dev_hard_start_xmit+0x38/0x200 Code: 53 48 89 fb 48 83 ec 20 48 85 ff 48 89 54 24 08 48 89 4c 24 18 0f 84 ab 01 00 00 48 8d 86 90 00 00 00 48 89 f5 48 89 44 24 10 <4c> 8b 33 48 c7 03 00 00 00 00 48 8b 05 c7 d1 b3 00 4d 85 f6 0f 95 RSP: 0018:ffff888627b437e0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: dead000000000100 RCX: ffff88862279c000 RDX: ffff888614a342c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff888618a88000 R08: 0000000000000001 R09: 00000000000003e8 R10: 0000000000000000 R11: ffff888614a34140 R12: 0000000000000000 R13: 0000000000000062 R14: dead000000000100 R15: ffff888616430000 FS: 0000000000000000(0000) GS:ffff888627b40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6d2bc6d000 CR3: 000000000200a000 CR4: 00000000000006e0 Call Trace: __dev_queue_xmit+0x623/0x870 ? masked_flow_lookup+0xf7/0x220 [openvswitch] ? ep_poll_callback+0x101/0x310 do_execute_actions+0xaba/0xaf0 [openvswitch] ? __wake_up_common+0x8a/0x150 ? __wake_up_common_lock+0x87/0xc0 ? queue_userspace_packet+0x31c/0x5b0 [openvswitch] ovs_execute_actions+0x47/0x120 [openvswitch] ovs_dp_process_packet+0x7d/0x110 [openvswitch] ovs_vport_receive+0x6e/0xd0 [openvswitch] ? dst_alloc+0x64/0x90 ? rt_dst_alloc+0x50/0xd0 ? ip_route_input_slow+0x19a/0x9a0 ? __udp_enqueue_schedule_skb+0x198/0x1b0 ? __udp4_lib_rcv+0x856/0xa30 ? __udp4_lib_rcv+0x856/0xa30 ? cpumask_next_and+0x19/0x20 ? find_busiest_group+0x12d/0xcd0 netdev_frame_hook+0xce/0x150 [openvswitch] __netif_receive_skb_core+0x205/0xae0 __netif_receive_skb_list_core+0x11e/0x220 netif_receive_skb_list+0x203/0x460 ? __efx_rx_packet+0x335/0x5e0 [sfc] efx_poll+0x182/0x320 [sfc] net_rx_action+0x294/0x3c0 __do_softirq+0xca/0x297 irq_exit+0xa6/0xb0 do_IRQ+0x54/0xd0 common_interrupt+0xf/0xf ======== So, in all listified-receive handling, instead pull skbs off the lists with skb_list_del_init(). Fixes: 9af86f933894 ("net: core: fix use-after-free in __netif_receive_skb_list_core") Fixes: 7da517a3bc52 ("net: core: Another step of skb receive list processing") Fixes: a4ca8b7df73c ("net: ipv4: fix drop handling in ip_list_rcv() and ip_list_rcv_finish()") Fixes: d8269e2cbf90 ("net: ipv6: listify ipv6_rcv() and ip6_rcv_finish()") Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++---- net/ipv4/ip_input.c | 4 ++-- net/ipv6/ip6_input.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index e06223b65674..722d50dbf8a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5014,7 +5014,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; - list_del(&skb->list); + skb_list_del_init(skb); __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; @@ -5170,7 +5170,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(netdev_tstamp_prequeue, skb); - list_del(&skb->list); + skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } @@ -5181,7 +5181,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); list_for_each_entry_safe(skb, next, head, list) { xdp_prog = rcu_dereference(skb->dev->xdp_prog); - list_del(&skb->list); + skb_list_del_init(skb); if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) list_add_tail(&skb->list, &sublist); } @@ -5200,7 +5200,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) if (cpu >= 0) { /* Will be handled, remove from list */ - list_del(&skb->list); + skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 35a786c0aaa0..e609b08c9df4 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -547,7 +547,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -594,7 +594,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..c1d85830c906 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -95,7 +95,7 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -296,7 +296,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) continue; From 41727549de3e7281feb174d568c6e46823db8684 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Dec 2018 14:24:31 -0800 Subject: [PATCH 75/93] tcp: Do not underestimate rwnd_limited If available rwnd is too small, tcp_tso_should_defer() can decide it is worth waiting before splitting a TSO packet. This really means we are rwnd limited. Fixes: 5615f88614a4 ("tcp: instrument how long TCP is limited by receive window") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 68b5326f7321..318690234758 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2356,8 +2356,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, - max_segs)) + max_segs)) { + if (!is_cwnd_limited) + is_rwnd_limited = true; break; + } } limit = mss_now; From b2b7af861122a0c0f6260155c29a1b2e594cd5b5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 5 Dec 2018 14:38:38 -0800 Subject: [PATCH 76/93] tcp: fix NULL ref in tail loss probe TCP loss probe timer may fire when the retransmission queue is empty but has a non-zero tp->packets_out counter. tcp_send_loss_probe will call tcp_rearm_rto which triggers a NULL pointer dereference by fetching the retransmission queue head in its sub-routines. Add a more detailed warning to help catch the root cause of the inflight accounting inconsistency. Reported-by: Rafael Tinoco Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 318690234758..5aa600900695 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2497,15 +2497,18 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); + if (unlikely(!skb)) { + WARN_ONCE(tp->packets_out, + "invalid inflight: %u state %u cwnd %u mss %d\n", + tp->packets_out, sk->sk_state, tp->snd_cwnd, mss); + inet_csk(sk)->icsk_pending = 0; + return; + } /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) goto rearm_timer; - /* Retransmit last segment. */ - if (WARN_ON(!skb)) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; From afd0a8006e98b1890908f81746c94ca5dae29d7c Mon Sep 17 00:00:00 2001 From: Jakub Audykowicz Date: Tue, 4 Dec 2018 20:27:41 +0100 Subject: [PATCH 77/93] sctp: frag_point sanity check If for some reason an association's fragmentation point is zero, sctp_datamsg_from_user will endlessly try to divide a message into zero-sized chunks. This eventually causes a kernel panic due to running out of memory. Although this situation is quite unlikely, it has occurred before as reported. I propose to add this simple last-ditch sanity check due to the severity of the potential consequences. Signed-off-by: Jakub Audykowicz Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/net/sctp/sctp.h | 5 +++++ net/sctp/chunk.c | 6 ++++++ net/sctp/socket.c | 3 +-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index ab9242e51d9e..2abbc15824af 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -620,4 +620,9 @@ static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) return false; } +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) +{ + return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); +} + #endif /* __net_sctp_h__ */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index ce8087846f05..d2048de86e7c 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -191,6 +191,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * the packet */ max_data = asoc->frag_point; + if (unlikely(!max_data)) { + max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), + sctp_datachk_len(&asoc->stream)); + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%Zu)", + __func__, asoc, max_data); + } /* If the the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bf618d1b41fd..b8cebd5a87e5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3324,8 +3324,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) : sizeof(struct sctp_data_chunk); - min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, - datasize); + min_len = sctp_min_frag_point(sp, datasize); max_len = SCTP_MAX_CHUNK_LEN - datasize; if (val < min_len || val > max_len) From ebaf39e6032faf77218220707fc3fa22487784e0 Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Wed, 5 Dec 2018 16:55:29 +0100 Subject: [PATCH 78/93] ipv4: ipv6: netfilter: Adjust the frag mem limit when truesize changes The *_frag_reasm() functions are susceptible to miscalculating the byte count of packet fragments in case the truesize of a head buffer changes. The truesize member may be changed by the call to skb_unclone(), leaving the fragment memory limit counter unbalanced even if all fragments are processed. This miscalculation goes unnoticed as long as the network namespace which holds the counter is not destroyed. Should an attempt be made to destroy a network namespace that holds an unbalanced fragment memory limit counter the cleanup of the namespace never finishes. The thread handling the cleanup gets stuck in inet_frags_exit_net() waiting for the percpu counter to reach zero. The thread is usually in running state with a stacktrace similar to: PID: 1073 TASK: ffff880626711440 CPU: 1 COMMAND: "kworker/u48:4" #5 [ffff880621563d48] _raw_spin_lock at ffffffff815f5480 #6 [ffff880621563d48] inet_evict_bucket at ffffffff8158020b #7 [ffff880621563d80] inet_frags_exit_net at ffffffff8158051c #8 [ffff880621563db0] ops_exit_list at ffffffff814f5856 #9 [ffff880621563dd8] cleanup_net at ffffffff814f67c0 #10 [ffff880621563e38] process_one_work at ffffffff81096f14 It is not possible to create new network namespaces, and processes that call unshare() end up being stuck in uninterruptible sleep state waiting to acquire the net_mutex. The bug was observed in the IPv6 netfilter code by Per Sundstrom. I thank him for his analysis of the problem. The parts of this patch that apply to IPv4 and IPv6 fragment reassembly are preemptive measures. 
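All three reassembly paths below apply the same accounting pattern; a minimal sketch of it (queue field names abbreviated, error handling reduced to a label):

	int delta = -head->truesize;		/* remember the old head charge */

	/* Head of list must not be cloned; this may replace the head buffer. */
	if (skb_unclone(head, GFP_ATOMIC))
		goto out_oom;

	delta += head->truesize;		/* account for any truesize change */
	if (delta)
		add_frag_mem_limit(q->net, delta);	/* rebalance the counter */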
Signed-off-by: Jiri Wiesner Reported-by: Per Sundstrom Acked-by: Peter Oskolkov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 7 +++++++ net/ipv6/netfilter/nf_conntrack_reasm.c | 8 +++++++- net/ipv6/reassembly.c | 8 +++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d6ee343fdb86..aa0b22697998 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -515,6 +515,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct rb_node *rbn; int len; int ihlen; + int delta; int err; u8 ecn; @@ -556,10 +557,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, if (len > 65535) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; + delta += head->truesize; + if (delta) + add_frag_mem_limit(qp->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d219979c3e52..181da2c40f9a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -341,7 +341,7 @@ static bool nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; u8 ecn; inet_frag_kill(&fq->q); @@ -363,10 +363,16 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic return false; } + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) return false; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c3c92713096..aa26c45486d9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -281,7 +281,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, { struct net *net = container_of(fq->q.net, struct net, ipv6.frags); struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; unsigned int nhoff; int sum_truesize; u8 ecn; @@ -322,10 +322,16 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (payload_len > IPV6_MAXPLEN) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ From 050fc01fb1d916058605569cd7f4e15152afc3af Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 Dec 2018 17:44:50 +0000 Subject: [PATCH 79/93] mlxsw: spectrum_nve: Remove easily triggerable warnings It is possible to trigger a warning in mlxsw in case a flood entry which mlxsw is not aware of is deleted from the VxLAN device. This is because mlxsw expects to find a singly linked list where the flood entry is present in. Fix by removing these warnings for now. 
Will re-add them in the next release after we teach mlxsw to ask for a dump of FDB entries from the VxLAN device, once it is enslaved to a bridge mlxsw cares about. Fixes: 6e6030bd5412 ("mlxsw: spectrum_nve: Implement common NVE core") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c index ad06d9969bc1..5c13674439f1 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c @@ -560,7 +560,7 @@ static void mlxsw_sp_nve_mc_list_ip_del(struct mlxsw_sp *mlxsw_sp, mc_record = mlxsw_sp_nve_mc_record_find(mc_list, proto, addr, &mc_entry); - if (WARN_ON(!mc_record)) + if (!mc_record) return; mlxsw_sp_nve_mc_record_entry_del(mc_record, mc_entry); @@ -647,7 +647,7 @@ void mlxsw_sp_nve_flood_ip_del(struct mlxsw_sp *mlxsw_sp, key.fid_index = mlxsw_sp_fid_index(fid); mc_list = mlxsw_sp_nve_mc_list_find(mlxsw_sp, &key); - if (WARN_ON(!mc_list)) + if (!mc_list) return; mlxsw_sp_nve_fid_flood_index_clear(fid, mc_list); From f58a83c207b791c6586b9675a589db5c6ac7909e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 Dec 2018 17:44:51 +0000 Subject: [PATCH 80/93] mlxsw: spectrum_switchdev: Avoid leaking FID's reference count It should never be possible for a user to set a VNI on a FID in case one is already set. The driver therefore returns an error, but fails to drop the reference count taken earlier when calling mlxsw_sp_fid_8021d_lookup(). Drop the reference when this unlikely error is hit. Fixes: 1c30d1836aeb ("mlxsw: spectrum: Enable VxLAN enslavement to bridges") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 739a51f0a366..7f2091c2648e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2134,8 +2134,10 @@ mlxsw_sp_bridge_8021d_vxlan_join(struct mlxsw_sp_bridge_device *bridge_device, if (!fid) return -EINVAL; - if (mlxsw_sp_fid_vni_is_set(fid)) - return -EINVAL; + if (mlxsw_sp_fid_vni_is_set(fid)) { + err = -EINVAL; + goto err_vni_exists; + } err = mlxsw_sp_nve_fid_enable(mlxsw_sp, fid, ¶ms, extack); if (err) @@ -2149,6 +2151,7 @@ mlxsw_sp_bridge_8021d_vxlan_join(struct mlxsw_sp_bridge_device *bridge_device, return 0; err_nve_fid_enable: +err_vni_exists: mlxsw_sp_fid_put(fid); return err; } From da93d2913fdf43d5cde3c5a53ac9cc29684d5c7c Mon Sep 17 00:00:00 2001 From: Nir Dotan Date: Thu, 6 Dec 2018 17:44:52 +0000 Subject: [PATCH 81/93] mlxsw: spectrum_router: Relax GRE decap matching check GRE decap offload is configured when local routes prefix correspond to the local address of one of the offloaded GRE tunnels. The matching check was found to be too strict, such that for a flat GRE configuration, in which the overlay and underlay traffic share the same non-default VRF, decap flow was not offloaded. Relax the check for decap flow offloading. A match occurs if the local address of the tunnel matches the local route address while both share the same VRF table. 
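In effect the match now reduces to the following check (a simplified sketch, not the exact driver code; proto_matches stands in for the ul_proto comparison):

	return proto_matches &&
	       mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, ul_dip,
						 ul_tb_id, ipip_entry);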
Fixes: 4607f6d26950 ("mlxsw: spectrum_router: Support IPv4 underlay decap") Signed-off-by: Nir Dotan Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 9e9bb57134f2..6ebf99cc3154 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1275,15 +1275,12 @@ mlxsw_sp_ipip_entry_matches_decap(struct mlxsw_sp *mlxsw_sp, { u32 ul_tb_id = l3mdev_fib_table(ul_dev) ? : RT_TABLE_MAIN; enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt; - struct net_device *ipip_ul_dev; if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto) return false; - ipip_ul_dev = __mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev); return mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, ul_dip, - ul_tb_id, ipip_entry) && - (!ipip_ul_dev || ipip_ul_dev == ul_dev); + ul_tb_id, ipip_entry); } /* Given decap parameters, find the corresponding IPIP entry. */ From 993107fea5eefdfdfde1ca38d3f01f0bebf76e77 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 Dec 2018 17:44:53 +0000 Subject: [PATCH 82/93] mlxsw: spectrum_switchdev: Fix VLAN device deletion via ioctl When deleting a VLAN device using an ioctl the netdev is unregistered before the VLAN filter is updated via ndo_vlan_rx_kill_vid(). It can lead to a use-after-free in mlxsw in case the VLAN device is deleted while being enslaved to a bridge. The reason for the above is that when mlxsw receives the CHANGEUPPER event, it wrongly assumes that the VLAN device is no longer its upper and thus destroys the internal representation of the bridge port despite the reference count being non-zero. Fix this by checking if the VLAN device is our upper using its real device. In net-next I'm going to remove this trick and instead make mlxsw completely agnostic to the order of the events. Fixes: c57529e1d5d8 ("mlxsw: spectrum: Replace vPorts with Port-VLAN") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 7f2091c2648e..50080c60a279 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -296,7 +296,13 @@ static bool mlxsw_sp_bridge_port_should_destroy(const struct mlxsw_sp_bridge_port * bridge_port) { - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(bridge_port->dev); + struct net_device *dev = bridge_port->dev; + struct mlxsw_sp *mlxsw_sp; + + if (is_vlan_dev(dev)) + mlxsw_sp = mlxsw_sp_lower_get(vlan_dev_real_dev(dev)); + else + mlxsw_sp = mlxsw_sp_lower_get(dev); /* In case ports were pulled from out of a bridged LAG, then * it's possible the reference count isn't zero, yet the bridge @@ -2109,7 +2115,7 @@ mlxsw_sp_bridge_8021d_port_leave(struct mlxsw_sp_bridge_device *bridge_device, vid = is_vlan_dev(dev) ? 
vlan_dev_vlan_id(dev) : 1; mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); - if (WARN_ON(!mlxsw_sp_port_vlan)) + if (!mlxsw_sp_port_vlan) return; mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan); From 1b4e5ad5d6b9f15cd0b5121f86d4719165958417 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Fri, 7 Dec 2018 09:50:17 +0200 Subject: [PATCH 83/93] ipv6: sr: properly initialize flowi6 prior passing to ip6_route_output In 'seg6_output', stack variable 'struct flowi6 fl6' was missing initialization. Fixes: 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels") Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/ipv6/seg6_iptunnel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index a8854dd3e9c5..8181ee7e1e27 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -347,6 +347,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); From f9bfe4e6a9d08d405fe7b081ee9a13e649c97ecf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Dec 2018 09:58:24 -0800 Subject: [PATCH 84/93] tcp: lack of available data can also cause TSO defer tcp_tso_should_defer() can return true in three different cases : 1) We are cwnd-limited 2) We are rwnd-limited 3) We are application limited. Neal pointed out that my recent fix went too far, since it assumed that if we were not in 1) case, we must be rwnd-limited Fix this by properly populating the is_cwnd_limited and is_rwnd_limited booleans. After this change, we can finally move the silly check for FIN flag only for the application-limited case. The same move for EOR bit will be handled in net-next, since commit 1c09f7d073b1 ("tcp: do not try to defer skbs with eor mark (MSG_EOR)") is scheduled for linux-4.21 Tested by running 200 concurrent netperf -t TCP_RR -- -r 60000,100 and checking none of them was rwnd_limited in the chrono_stat output from "ss -ti" command. Fixes: 41727549de3e ("tcp: Do not underestimate rwnd_limited") Signed-off-by: Eric Dumazet Suggested-by: Neal Cardwell Reviewed-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5aa600900695..d1676d8a6ed7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1904,7 +1904,9 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited, u32 max_segs) + bool *is_cwnd_limited, + bool *is_rwnd_limited, + u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); u32 age, send_win, cong_win, limit, in_flight; @@ -1912,9 +1914,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, struct sk_buff *head; int win_divisor; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - goto send_now; - if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; @@ -1973,10 +1972,27 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if (age < (tp->srtt_us >> 4)) goto send_now; - /* Ok, it looks like it is advisable to defer. 
*/ + /* Ok, it looks like it is advisable to defer. + * Three cases are tracked : + * 1) We are cwnd-limited + * 2) We are rwnd-limited + * 3) We are application limited. + */ + if (cong_win < send_win) { + if (cong_win <= skb->len) { + *is_cwnd_limited = true; + return true; + } + } else { + if (send_win <= skb->len) { + *is_rwnd_limited = true; + return true; + } + } - if (cong_win < send_win && cong_win <= skb->len) - *is_cwnd_limited = true; + /* If this packet won't get more data, do not wait. */ + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto send_now; return true; @@ -2356,11 +2372,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, - max_segs)) { - if (!is_cwnd_limited) - is_rwnd_limited = true; + &is_rwnd_limited, max_segs)) break; - } } limit = mss_now; From 66033f47ca60294a95fc85ec3a3cc909dab7b765 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 6 Dec 2018 19:30:36 +0100 Subject: [PATCH 85/93] ipv6: Check available headroom in ip6_xmit() even without options Even if we send an IPv6 packet without options, MAX_HEADER might not be enough to account for the additional headroom required by alignment of hardware headers. On a configuration without HYPERV_NET, WLAN, AX25, and with IPV6_TUNNEL, sending short SCTP packets over IPv4 over L2TP over IPv6, we start with 100 bytes of allocated headroom in sctp_packet_transmit(), end up with 54 bytes after l2tp_xmit_skb(), and 14 bytes in ip6_finish_output2(). Those would be enough to append our 14 bytes header, but we're going to align that to 16 bytes, and write 2 bytes out of the allocated slab in neigh_hh_output(). KASan says: [ 264.967848] ================================================================== [ 264.967861] BUG: KASAN: slab-out-of-bounds in ip6_finish_output2+0x1aec/0x1c70 [ 264.967866] Write of size 16 at addr 000000006af1c7fe by task netperf/6201 [ 264.967870] [ 264.967876] CPU: 0 PID: 6201 Comm: netperf Not tainted 4.20.0-rc4+ #1 [ 264.967881] Hardware name: IBM 2827 H43 400 (z/VM 6.4.0) [ 264.967887] Call Trace: [ 264.967896] ([<00000000001347d6>] show_stack+0x56/0xa0) [ 264.967903] [<00000000017e379c>] dump_stack+0x23c/0x290 [ 264.967912] [<00000000007bc594>] print_address_description+0xf4/0x290 [ 264.967919] [<00000000007bc8fc>] kasan_report+0x13c/0x240 [ 264.967927] [<000000000162f5e4>] ip6_finish_output2+0x1aec/0x1c70 [ 264.967935] [<000000000163f890>] ip6_finish_output+0x430/0x7f0 [ 264.967943] [<000000000163fe44>] ip6_output+0x1f4/0x580 [ 264.967953] [<000000000163882a>] ip6_xmit+0xfea/0x1ce8 [ 264.967963] [<00000000017396e2>] inet6_csk_xmit+0x282/0x3f8 [ 264.968033] [<000003ff805fb0ba>] l2tp_xmit_skb+0xe02/0x13e0 [l2tp_core] [ 264.968037] [<000003ff80631192>] l2tp_eth_dev_xmit+0xda/0x150 [l2tp_eth] [ 264.968041] [<0000000001220020>] dev_hard_start_xmit+0x268/0x928 [ 264.968069] [<0000000001330e8e>] sch_direct_xmit+0x7ae/0x1350 [ 264.968071] [<000000000122359c>] __dev_queue_xmit+0x2b7c/0x3478 [ 264.968075] [<00000000013d2862>] ip_finish_output2+0xce2/0x11a0 [ 264.968078] [<00000000013d9b14>] ip_finish_output+0x56c/0x8c8 [ 264.968081] [<00000000013ddd1e>] ip_output+0x226/0x4c0 [ 264.968083] [<00000000013dbd6c>] __ip_queue_xmit+0x894/0x1938 [ 264.968100] [<000003ff80bc3a5c>] sctp_packet_transmit+0x29d4/0x3648 [sctp] [ 264.968116] [<000003ff80b7bf68>] sctp_outq_flush_ctrl.constprop.5+0x8d0/0xe50 [sctp] [ 264.968131] [<000003ff80b7c716>] sctp_outq_flush+0x22e/0x7d8 [sctp] [ 264.968146] 
[<000003ff80b35c68>] sctp_cmd_interpreter.isra.16+0x530/0x6800 [sctp] [ 264.968161] [<000003ff80b3410a>] sctp_do_sm+0x222/0x648 [sctp] [ 264.968177] [<000003ff80bbddac>] sctp_primitive_ASSOCIATE+0xbc/0xf8 [sctp] [ 264.968192] [<000003ff80b93328>] __sctp_connect+0x830/0xc20 [sctp] [ 264.968208] [<000003ff80bb11ce>] sctp_inet_connect+0x2e6/0x378 [sctp] [ 264.968212] [<0000000001197942>] __sys_connect+0x21a/0x450 [ 264.968215] [<000000000119aff8>] sys_socketcall+0x3d0/0xb08 [ 264.968218] [<000000000184ea7a>] system_call+0x2a2/0x2c0 [...] Just like ip_finish_output2() does for IPv4, check that we have enough headroom in ip6_xmit(), and reallocate it if we don't. This issue is older than git history. Reported-by: Jianlin Shi Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 827a3f5ff3bb..fcd3c66ded16 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -195,37 +195,37 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); + unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; - if (opt) { - unsigned int head_room; + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + if (opt) + head_room += opt->opt_nflen + opt->opt_flen; - /* First: exthdrs may take lots of space (~8K for now) - MAX_HEADER is not enough. - */ - head_room = opt->opt_nflen + opt->opt_flen; - seg_len += head_room; - head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); - - if (skb_headroom(skb) < head_room) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (!skb2) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); - return -ENOBUFS; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; + if (unlikely(skb_headroom(skb) < head_room)) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (!skb2) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + if (opt) { + seg_len += opt->opt_nflen + opt->opt_flen; + if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, &fl6->saddr); From e6ac64d4c4d095085d7dd71cbd05704ac99829b2 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 6 Dec 2018 19:30:37 +0100 Subject: [PATCH 86/93] neighbour: Avoid writing before skb->head in neigh_hh_output() While skb_push() makes the kernel panic if the skb headroom is less than the unaligned hardware header size, it will proceed normally in case we copy more than that because of alignment, and we'll silently corrupt adjacent slabs. In the case fixed by the previous patch, "ipv6: Check available headroom in ip6_xmit() even without options", we end up in neigh_hh_output() with 14 bytes headroom, 14 bytes hardware header and write 16 bytes, starting 2 bytes before the allocated buffer. Always check we're not writing before skb->head and, if the headroom is not enough, warn and drop the packet. 
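A worked example of the overflow with the numbers from the scenario above (HH_DATA_MOD is 16):

	/*
	 * skb_headroom(skb) = 14   bytes available in front of skb->data
	 * hh_len            = 14   unaligned Ethernet hardware header
	 * copy size         = 16   HH_DATA_MOD-aligned copy in neigh_hh_output()
	 *
	 * memcpy(skb->data - 16, hh->hh_data, 16) therefore starts writing
	 * 2 bytes before skb->head and corrupts the adjacent slab object,
	 * while skb_push(skb, 14) alone would not have triggered the panic.
	 */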
v2: - instead of panicking with BUG_ON(), WARN_ON_ONCE() and drop the packet (Eric Dumazet) - if we avoid the panic, though, we need to explicitly check the headroom before the memcpy(), otherwise we'll have corrupted slabs on a running kernel, after we warn - use __skb_push() instead of skb_push(), as the headroom check is already implemented here explicitly (Eric Dumazet) Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- include/net/neighbour.h | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f58b384aa6c9..665990c7dec8 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -454,6 +454,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { + unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; @@ -461,16 +462,33 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb seq = read_seqbegin(&hh->hh_lock); hh_len = hh->hh_len; if (likely(hh_len <= HH_DATA_MOD)) { - /* this is inlined by gcc */ - memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); - } else { - unsigned int hh_alen = HH_DATA_ALIGN(hh_len); + hh_alen = HH_DATA_MOD; - memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); + /* skb_push() would proceed silently if we have room for + * the unaligned size but not for the aligned size: + * check headroom explicitly. + */ + if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { + /* this is inlined by gcc */ + memcpy(skb->data - HH_DATA_MOD, hh->hh_data, + HH_DATA_MOD); + } + } else { + hh_alen = HH_DATA_ALIGN(hh_len); + + if (likely(skb_headroom(skb) >= hh_alen)) { + memcpy(skb->data - hh_alen, hh->hh_data, + hh_alen); + } } } while (read_seqretry(&hh->hh_lock, seq)); - skb_push(skb, hh_len); + if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + + __skb_push(skb, hh_len); return dev_queue_xmit(skb); } From 5b3279e2cba2238b37f6c18adfdea8bddb32715a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 7 Dec 2018 15:05:04 +1100 Subject: [PATCH 87/93] Revert "net/ibm/emac: wrong bit is used for STA control" This reverts commit 624ca9c33c8a853a4a589836e310d776620f4ab9. This commit is completely bogus. The STACR register has two formats, old and new, depending on the version of the IP block used. There's a pair of device-tree properties that can be used to specify the format used: has-inverted-stacr-oc has-new-stacr-staopc What this commit did was to change the bit definition used with the old parts to match the new parts. This of course breaks the driver on all the old ones. Instead, the author should have set the appropriate properties in the device-tree for the variant used on his board. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/emac/emac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/emac/emac.h b/drivers/net/ethernet/ibm/emac/emac.h index e2f80cca9bed..0d2de6f67676 100644 --- a/drivers/net/ethernet/ibm/emac/emac.h +++ b/drivers/net/ethernet/ibm/emac/emac.h @@ -231,7 +231,7 @@ struct emac_regs { #define EMAC_STACR_PHYE 0x00004000 #define EMAC_STACR_STAC_MASK 0x00003000 #define EMAC_STACR_STAC_READ 0x00001000 -#define EMAC_STACR_STAC_WRITE 0x00000800 +#define EMAC_STACR_STAC_WRITE 0x00002000 #define EMAC_STACR_OPBC_MASK 0x00000C00 #define EMAC_STACR_OPBC_50 0x00000000 #define EMAC_STACR_OPBC_66 0x00000400 From bd5122cd1e0644d8bd8dd84517c932773e999766 Mon Sep 17 00:00:00 2001 From: Tarick Bedeir Date: Fri, 7 Dec 2018 00:30:26 -0800 Subject: [PATCH 88/93] net/mlx4_core: Correctly set PFC param if global pause is turned off. rx_ppp and tx_ppp can be set between 0 and 255, so don't clamp to 1. Fixes: 6e8814ceb7e8 ("net/mlx4_en: Fix mixed PFC and Global pause user control requests") Signed-off-by: Tarick Bedeir Reviewed-by: Eran Ben Elisha Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index f11b45001cad..d290f0787dfb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1084,8 +1084,8 @@ static int mlx4_en_set_pauseparam(struct net_device *dev, tx_pause = !!(pause->tx_pause); rx_pause = !!(pause->rx_pause); - rx_ppp = priv->prof->rx_ppp && !(tx_pause || rx_pause); - tx_ppp = priv->prof->tx_ppp && !(tx_pause || rx_pause); + rx_ppp = (tx_pause || rx_pause) ? 0 : priv->prof->rx_ppp; + tx_ppp = (tx_pause || rx_pause) ? 0 : priv->prof->tx_ppp; err = mlx4_SET_PORT_general(mdev->dev, priv->port, priv->rx_skb_size + ETH_FCS_LEN, From 804fba4e9f508c8004a4bfbdf3f300ca237c56df Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:00:59 -0500 Subject: [PATCH 89/93] bnxt_en: Fix CNP CoS queue regression. Recent changes to support the 57500 devices have created this regression. The bnxt_hwrm_queue_qportcfg() call was moved to be called earlier before the RDMA support was determined, causing the CoS queues configuration to be set before knowing whether RDMA was supported or not. Fix it by moving it to the right place right after RDMA support is determined. Fixes: 98f04cf0f1fc ("bnxt_en: Check context memory requirements from firmware.") Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d4c300117529..0cf4cb93c1e1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6292,6 +6292,8 @@ hwrm_func_qcaps_exit: return rc; } +static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp); + static int bnxt_hwrm_func_qcaps(struct bnxt *bp) { int rc; @@ -6299,6 +6301,11 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp) rc = __bnxt_hwrm_func_qcaps(bp); if (rc) return rc; + rc = bnxt_hwrm_queue_qportcfg(bp); + if (rc) { + netdev_err(bp->dev, "hwrm query qportcfg failure rc: %d\n", rc); + return rc; + } if (bp->hwrm_spec_code >= 0x10803) { rc = bnxt_alloc_ctx_mem(bp); if (rc) From 75720e6323a1d195ae3ebf1a7b5e17c2e687f552 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:01:00 -0500 Subject: [PATCH 90/93] bnxt_en: Keep track of reserved IRQs. The new 57500 chips use 1 NQ per MSIX vector, whereas legacy chips use 1 CP ring per MSIX vector. To better unify this, add a resv_irqs field to struct bnxt_hw_resc. On legacy chips, we initialize resv_irqs with resv_cp_rings. On new chips, we initialize it with the allocated MSIX resources. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0cf4cb93c1e1..c39820b2268f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5162,6 +5162,7 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp) cp = le16_to_cpu(resp->alloc_cmpl_rings); stats = le16_to_cpu(resp->alloc_stat_ctx); cp = min_t(u16, cp, stats); + hw_resc->resv_irqs = cp; if (bp->flags & BNXT_FLAG_CHIP_P5) { int rx = hw_resc->resv_rx_rings; int tx = hw_resc->resv_tx_rings; @@ -5175,7 +5176,7 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp) hw_resc->resv_rx_rings = rx; hw_resc->resv_tx_rings = tx; } - cp = le16_to_cpu(resp->alloc_msix); + hw_resc->resv_irqs = le16_to_cpu(resp->alloc_msix); hw_resc->resv_hw_ring_grps = rx; } hw_resc->resv_cp_rings = cp; @@ -7055,7 +7056,9 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num) int total_req = bp->cp_nr_rings + num; int max_idx, avail_msix; - max_idx = min_t(int, bp->total_irqs, max_cp); + max_idx = bp->total_irqs; + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + max_idx = min_t(int, bp->total_irqs, max_cp); avail_msix = max_idx - bp->cp_nr_rings; if (!BNXT_NEW_RM(bp) || avail_msix >= num) return avail_msix; @@ -7801,6 +7804,7 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) rc = bnxt_hwrm_func_resc_qcaps(bp, true); hw_resc->resv_cp_rings = 0; + hw_resc->resv_irqs = 0; hw_resc->resv_tx_rings = 0; hw_resc->resv_rx_rings = 0; hw_resc->resv_hw_ring_grps = 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 9e99d4ab3e06..3030931ccaf8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -928,6 +928,7 @@ struct bnxt_hw_resc { u16 min_stat_ctxs; u16 max_stat_ctxs; u16 max_irqs; + u16 resv_irqs; }; #if defined(CONFIG_BNXT_SRIOV) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index b59b382d34f9..0a3097baafde 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -168,7 +168,7 @@ static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id, if (BNXT_NEW_RM(bp)) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; - avail_msix = hw_resc->resv_cp_rings - bp->cp_nr_rings; + avail_msix = hw_resc->resv_irqs - bp->cp_nr_rings; edev->ulp_tbl[ulp_id].msix_requested = avail_msix; } bnxt_fill_msix_vecs(bp, ent); From c0b8cda05e1d8151f57a79e525c2c7d51cec2f4e Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:01:01 -0500 Subject: [PATCH 91/93] bnxt_en: Fix NQ/CP rings accounting on the new 57500 chips. The new 57500 chips have introduced the NQ structure in addition to the existing CP rings in all chips. We need to introduce a new bnxt_nq_rings_in_use(). On legacy chips, the 2 functions are the same and one will just call the other. On the new chips, they refer to the 2 separate ring structures. The new function is now called to determine the resource (NQ or CP rings) associated with MSIX that are in use. On 57500 chips, the RDMA driver does not use the CP rings so we don't need to do the subtraction adjustment. Fixes: 41e8d7983752 ("bnxt_en: Modify the ring reservation functions for 57500 series chips.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 29 ++++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c39820b2268f..2e90d98640d1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5354,7 +5354,7 @@ static int bnxt_hwrm_reserve_rings(struct bnxt *bp, int tx, int rx, int grp, return bnxt_hwrm_reserve_vf_rings(bp, tx, rx, grp, cp, vnic); } -static int bnxt_cp_rings_in_use(struct bnxt *bp) +static int bnxt_nq_rings_in_use(struct bnxt *bp) { int cp = bp->cp_nr_rings; int ulp_msix, ulp_base; @@ -5369,10 +5369,22 @@ static int bnxt_cp_rings_in_use(struct bnxt *bp) return cp; } +static int bnxt_cp_rings_in_use(struct bnxt *bp) +{ + int cp; + + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + return bnxt_nq_rings_in_use(bp); + + cp = bp->tx_nr_rings + bp->rx_nr_rings; + return cp; +} + static bool bnxt_need_reserve_rings(struct bnxt *bp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; int cp = bnxt_cp_rings_in_use(bp); + int nq = bnxt_nq_rings_in_use(bp); int rx = bp->rx_nr_rings; int vnic = 1, grp = rx; @@ -5388,7 +5400,7 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp) rx <<= 1; if (BNXT_NEW_RM(bp) && (hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp || - hw_resc->resv_vnics != vnic || + hw_resc->resv_irqs < nq || hw_resc->resv_vnics != vnic || (hw_resc->resv_hw_ring_grps != grp && !(bp->flags & BNXT_FLAG_CHIP_P5)))) return true; @@ -5398,7 +5410,7 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp) static int __bnxt_reserve_rings(struct bnxt *bp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; - int cp = bnxt_cp_rings_in_use(bp); + int cp = bnxt_nq_rings_in_use(bp); int tx = bp->tx_nr_rings; int rx = bp->rx_nr_rings; int grp, rx_rings, rc; @@ -5423,7 +5435,7 @@ static int __bnxt_reserve_rings(struct bnxt *bp) tx = hw_resc->resv_tx_rings; if (BNXT_NEW_RM(bp)) { rx = hw_resc->resv_rx_rings; - cp = hw_resc->resv_cp_rings; + cp = hw_resc->resv_irqs; grp = hw_resc->resv_hw_ring_grps; vnic = hw_resc->resv_vnics; 
} @@ -7034,7 +7046,12 @@ unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp) unsigned int bnxt_get_max_func_cp_rings_for_en(struct bnxt *bp) { - return bp->hw_resc.max_cp_rings - bnxt_get_ulp_msix_num(bp); + unsigned int cp = bp->hw_resc.max_cp_rings; + + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + cp -= bnxt_get_ulp_msix_num(bp); + + return cp; } static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp) @@ -7076,7 +7093,7 @@ static int bnxt_get_num_msix(struct bnxt *bp) if (!BNXT_NEW_RM(bp)) return bnxt_get_max_func_irqs(bp); - return bnxt_cp_rings_in_use(bp); + return bnxt_nq_rings_in_use(bp); } static int bnxt_init_msix(struct bnxt *bp) From e30fbc33190b8ba1d6e8ff4864627f7414b5ca99 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 9 Dec 2018 07:01:02 -0500 Subject: [PATCH 92/93] bnxt_en: Fix _bnxt_get_max_rings() for 57500 chips. The CP rings are accounted differently on the new 57500 chips. There must be enough CP rings for the sum of RX and TX rings on the new chips. The current logic may be over-estimating the RX and TX rings. The output parameter max_cp should be the maximum NQs capped by MSIX vectors available for networking in the context of 57500 chips. The existing code which uses CMPL rings capped by the MSIX vectors works most of the time but is not always correct. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2e90d98640d1..5d21c14853ac 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9827,13 +9827,16 @@ static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, int *max_cp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; - int max_ring_grps = 0; + int max_ring_grps = 0, max_irq; *max_tx = hw_resc->max_tx_rings; *max_rx = hw_resc->max_rx_rings; - *max_cp = min_t(int, bnxt_get_max_func_cp_rings_for_en(bp), - hw_resc->max_irqs - bnxt_get_ulp_msix_num(bp)); - *max_cp = min_t(int, *max_cp, hw_resc->max_stat_ctxs); + *max_cp = bnxt_get_max_func_cp_rings_for_en(bp); + max_irq = min_t(int, bnxt_get_max_func_irqs(bp) - + bnxt_get_ulp_msix_num(bp), + bnxt_get_max_func_stat_ctxs(bp)); + if (!(bp->flags & BNXT_FLAG_CHIP_P5)) + *max_cp = min_t(int, *max_cp, max_irq); max_ring_grps = hw_resc->max_hw_ring_grps; if (BNXT_CHIP_TYPE_NITRO_A0(bp) && BNXT_PF(bp)) { *max_cp -= 1; @@ -9841,6 +9844,11 @@ static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, } if (bp->flags & BNXT_FLAG_AGG_RINGS) *max_rx >>= 1; + if (bp->flags & BNXT_FLAG_CHIP_P5) { + bnxt_trim_rings(bp, max_rx, max_tx, *max_cp, false); + /* On P5 chips, max_cp output param should be available NQs */ + *max_cp = max_irq; + } *max_rx = min_t(int, *max_rx, max_ring_grps); } From 35cc3cefc4de90001c9137e2d01dd9d06b11acfb Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 9 Dec 2018 18:10:24 +0200 Subject: [PATCH 93/93] net/sched: cls_flower: Reject duplicated rules also under skip_sw Currently, duplicated rules are rejected only for skip_hw or "none", hence allowing users to push duplicates into HW for no reason. Use the flower tables to protect for that. Signed-off-by: Or Gerlitz Signed-off-by: Paul Blakey Reported-by: Chris Mi Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c6c327874abc..71312d7bd8f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1238,18 +1238,16 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout_idr; - if (!tc_skip_sw(fnew->flags)) { - if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { - err = -EEXIST; - goto errout_mask; - } - - err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, - fnew->mask->filter_ht_params); - if (err) - goto errout_mask; + if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { + err = -EEXIST; + goto errout_mask; } + err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, + fnew->mask->filter_ht_params); + if (err) + goto errout_mask; + if (!tc_skip_hw(fnew->flags)) { err = fl_hw_replace_filter(tp, fnew, extack); if (err) @@ -1303,9 +1301,8 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; - if (!tc_skip_sw(f->flags)) - rhashtable_remove_fast(&f->mask->ht, &f->ht_node, - f->mask->filter_ht_params); + rhashtable_remove_fast(&f->mask->ht, &f->ht_node, + f->mask->filter_ht_params); __fl_delete(tp, f, extack); *last = list_empty(&head->masks); return 0;