Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Pull networking fixes from David Miller:

 1) In TCP, don't register an FRTO for cumulatively ACK'd data that was
    previously SACK'd, from Neal Cardwell.

 2) Need to hold RNL mutex in ipv4 multicast code namespace cleanup,
    from Cong WANG.

 3) Similarly we have to hold RNL mutex for fib_rules_unregister(), also
    from Cong WANG.

 4) Revert and rework netns nsid allocation fix, from Nicolas Dichtel.

 5) When we encapsulate for a tunnel device, skb->sk still points to the
    user socket.  So this leads to cases where we retraverse the
    ipv4/ipv6 output path with skb->sk being of some other address
    family (f.e. AF_PACKET).  This can cause things to crash since the
    ipv4 output path is dereferencing an AF_PACKET socket as if it were
    an ipv4 one.

    The short term fix for 'net' and -stable is to elide these socket
    checks once we've entered an encapsulation sequence by testing
    xmit_recursion.

    Longer term we have a better solution wherein we pass the tunnel's
    socket down through the output paths, but that is way too invasive
    for 'net' and -stable.

    From Hannes Frederic Sowa.

 6) l2tp_init() failure path forgets to unregister per-net ops, from
    Cong WANG.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net:
  net/mlx4_core: Fix error message deprecation for ConnectX-2 cards
  net: dsa: fix filling routing table from OF description
  l2tp: unregister l2tp_net_ops on failure path
  mvneta: dont call mvneta_adjust_link() manually
  ipv6: protect skb->sk accesses from recursive dereference inside the stack
  netns: don't allocate an id for dead netns
  Revert "netns: don't clear nsid too early on removal"
  ip6mr: call del_timer_sync() in ip6mr_free_table()
  net: move fib_rules_unregister() under rtnl lock
  ipv4: take rtnl_lock and mark mrt table as freed on namespace cleanup
  tcp: fix FRTO undo on cumulative ACK of SACKed range
  xen-netfront: transmit fully GSO-sized packets
diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt
index e124847..f0b4cd7 100644
--- a/Documentation/devicetree/bindings/net/dsa/dsa.txt
+++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt
@@ -19,7 +19,9 @@
 (DSA_MAX_SWITCHES).
 Each of these switch child nodes should have the following required properties:
 
-- reg			: Describes the switch address on the MII bus
+- reg			: Contains two fields. The first one describes the
+			  address on the MII bus. The second is the switch
+			  number that must be unique in cascaded configurations
 - #address-cells	: Must be 1
 - #size-cells		: Must be 0
 
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 96208f1..2db6532 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2658,16 +2658,11 @@
 static int mvneta_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	struct mvneta_port *pp = netdev_priv(dev);
-	int ret;
 
 	if (!pp->phy_dev)
 		return -ENOTSUPP;
 
-	ret = phy_mii_ioctl(pp->phy_dev, ifr, cmd);
-	if (!ret)
-		mvneta_adjust_link(dev);
-
-	return ret;
+	return phy_mii_ioctl(pp->phy_dev, ifr, cmd);
 }
 
 /* Ethtool methods */
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 3350721..546ca42 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -724,7 +724,8 @@
 		 * on the host, we deprecate the error message for this
 		 * specific command/input_mod/opcode_mod/fw-status to be debug.
 		 */
-		if (op == MLX4_CMD_SET_PORT && in_modifier == 1 &&
+		if (op == MLX4_CMD_SET_PORT &&
+		    (in_modifier == 1 || in_modifier == 2) &&
 		    op_modifier == 0 && context->fw_status == CMD_STAT_BAD_SIZE)
 			mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n",
 				 op, context->fw_status);
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index e9b960f..720aaf6 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1008,8 +1008,7 @@
 
 static int xennet_change_mtu(struct net_device *dev, int mtu)
 {
-	int max = xennet_can_sg(dev) ?
-		XEN_NETIF_MAX_TX_SIZE - MAX_TCP_HEADER : ETH_DATA_LEN;
+	int max = xennet_can_sg(dev) ? XEN_NETIF_MAX_TX_SIZE : ETH_DATA_LEN;
 
 	if (mtu > max)
 		return -EINVAL;
@@ -1279,8 +1278,6 @@
 	netdev->ethtool_ops = &xennet_ethtool_ops;
 	SET_NETDEV_DEV(netdev, &dev->dev);
 
-	netif_set_gso_max_size(netdev, XEN_NETIF_MAX_TX_SIZE - MAX_TCP_HEADER);
-
 	np->netdev = netdev;
 
 	netif_carrier_off(netdev);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dcf6ec2..2787388 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2185,6 +2185,12 @@
 void synchronize_net(void);
 int init_dummy_netdev(struct net_device *dev);
 
+DECLARE_PER_CPU(int, xmit_recursion);
+static inline int dev_recursion_level(void)
+{
+	return this_cpu_read(xmit_recursion);
+}
+
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
diff --git a/include/net/ip.h b/include/net/ip.h
index 025c61c..6cc1eaf 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -453,22 +453,6 @@
 
 #endif
 
-static inline int sk_mc_loop(struct sock *sk)
-{
-	if (!sk)
-		return 1;
-	switch (sk->sk_family) {
-	case AF_INET:
-		return inet_sk(sk)->mc_loop;
-#if IS_ENABLED(CONFIG_IPV6)
-	case AF_INET6:
-		return inet6_sk(sk)->mc_loop;
-#endif
-	}
-	WARN_ON(1);
-	return 1;
-}
-
 bool ip_call_ra_chain(struct sk_buff *skb);
 
 /*
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 1d09b46..eda131d 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,7 +174,8 @@
 
 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
-	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+				inet6_sk(skb->sk) : NULL;
 
 	return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ?
 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
diff --git a/include/net/sock.h b/include/net/sock.h
index ab186b1..e4079c2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1762,6 +1762,8 @@
 
 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+bool sk_mc_loop(struct sock *sk);
+
 static inline bool sk_can_gso(const struct sock *sk)
 {
 	return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
diff --git a/net/core/dev.c b/net/core/dev.c
index 962ee9d..45109b7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2848,7 +2848,9 @@
 #define skb_update_prio(skb)
 #endif
 
-static DEFINE_PER_CPU(int, xmit_recursion);
+DEFINE_PER_CPU(int, xmit_recursion);
+EXPORT_SYMBOL(xmit_recursion);
+
 #define RECURSION_LIMIT 10
 
 /**
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 44706e8..e4fdc9d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -175,9 +175,9 @@
 
 	spin_lock(&net->rules_mod_lock);
 	list_del_rcu(&ops->list);
-	fib_rules_cleanup_ops(ops);
 	spin_unlock(&net->rules_mod_lock);
 
+	fib_rules_cleanup_ops(ops);
 	call_rcu(&ops->rcu, fib_rules_put_rcu);
 }
 EXPORT_SYMBOL_GPL(fib_rules_unregister);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 5221f97..70d3450 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -198,8 +198,10 @@
  */
 int peernet2id(struct net *net, struct net *peer)
 {
-	int id = __peernet2id(net, peer, true);
+	bool alloc = atomic_read(&peer->count) == 0 ? false : true;
+	int id;
 
+	id = __peernet2id(net, peer, alloc);
 	return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
 }
 EXPORT_SYMBOL(peernet2id);
@@ -349,7 +351,7 @@
 static void cleanup_net(struct work_struct *work)
 {
 	const struct pernet_operations *ops;
-	struct net *net, *tmp, *peer;
+	struct net *net, *tmp;
 	struct list_head net_kill_list;
 	LIST_HEAD(net_exit_list);
 
@@ -365,6 +367,14 @@
 	list_for_each_entry(net, &net_kill_list, cleanup_list) {
 		list_del_rcu(&net->list);
 		list_add_tail(&net->exit_list, &net_exit_list);
+		for_each_net(tmp) {
+			int id = __peernet2id(tmp, net, false);
+
+			if (id >= 0)
+				idr_remove(&tmp->netns_ids, id);
+		}
+		idr_destroy(&net->netns_ids);
+
 	}
 	rtnl_unlock();
 
@@ -390,26 +400,12 @@
 	 */
 	rcu_barrier();
 
-	rtnl_lock();
 	/* Finally it is safe to free my network namespace structure */
 	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
-		/* Unreference net from all peers (no need to loop over
-		 * net_exit_list because idr_destroy() will be called for each
-		 * element of this list.
-		 */
-		for_each_net(peer) {
-			int id = __peernet2id(peer, net, false);
-
-			if (id >= 0)
-				idr_remove(&peer->netns_ids, id);
-		}
-		idr_destroy(&net->netns_ids);
-
 		list_del_init(&net->exit_list);
 		put_user_ns(net->user_ns);
 		net_drop_ns(net);
 	}
-	rtnl_unlock();
 }
 static DECLARE_WORK(net_cleanup_work, cleanup_net);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 78e89eb..71e3e5f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -653,6 +653,25 @@
 		sock_reset_flag(sk, bit);
 }
 
+bool sk_mc_loop(struct sock *sk)
+{
+	if (dev_recursion_level())
+		return false;
+	if (!sk)
+		return true;
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet_sk(sk)->mc_loop;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return inet6_sk(sk)->mc_loop;
+#endif
+	}
+	WARN_ON(1);
+	return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index faf7cc3..9d66a0f 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -248,7 +248,9 @@
 
 void __exit dn_fib_rules_cleanup(void)
 {
+	rtnl_lock();
 	fib_rules_unregister(dn_fib_rules_ops);
+	rtnl_unlock();
 	rcu_barrier();
 }
 
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 2173402..4dea2e0 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -501,12 +501,10 @@
 #ifdef CONFIG_OF
 static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
 					struct dsa_chip_data *cd,
-					int chip_index,
+					int chip_index, int port_index,
 					struct device_node *link)
 {
-	int ret;
 	const __be32 *reg;
-	int link_port_addr;
 	int link_sw_addr;
 	struct device_node *parent_sw;
 	int len;
@@ -519,6 +517,10 @@
 	if (!reg || (len != sizeof(*reg) * 2))
 		return -EINVAL;
 
+	/*
+	 * Get the destination switch number from the second field of its 'reg'
+	 * property, i.e. for "reg = <0x19 1>" sw_addr is '1'.
+	 */
 	link_sw_addr = be32_to_cpup(reg + 1);
 
 	if (link_sw_addr >= pd->nr_chips)
@@ -535,20 +537,9 @@
 		memset(cd->rtable, -1, pd->nr_chips * sizeof(s8));
 	}
 
-	reg = of_get_property(link, "reg", NULL);
-	if (!reg) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	link_port_addr = be32_to_cpup(reg);
-
-	cd->rtable[link_sw_addr] = link_port_addr;
+	cd->rtable[link_sw_addr] = port_index;
 
 	return 0;
-out:
-	kfree(cd->rtable);
-	return ret;
 }
 
 static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
@@ -658,7 +649,7 @@
 			if (!strcmp(port_name, "dsa") && link &&
 					pd->nr_chips > 1) {
 				ret = dsa_of_setup_routing_table(pd, cd,
-						chip_index, link);
+						chip_index, port_index, link);
 				if (ret)
 					goto out_free_chip;
 			}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 57be71d..23b9b3e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1111,11 +1111,10 @@
 {
 	unsigned int i;
 
+	rtnl_lock();
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	fib4_rules_exit(net);
 #endif
-
-	rtnl_lock();
 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
 		struct fib_table *tb;
 		struct hlist_head *head;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9282544..fe54eba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -278,11 +278,13 @@
 {
 	struct mr_table *mrt, *next;
 
+	rtnl_lock();
 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
 		list_del(&mrt->list);
 		ipmr_free_table(mrt);
 	}
 	fib_rules_unregister(net->ipv4.mr_rules_ops);
+	rtnl_unlock();
 }
 #else
 #define ipmr_for_each_table(mrt, net) \
@@ -308,7 +310,10 @@
 
 static void __net_exit ipmr_rules_exit(struct net *net)
 {
+	rtnl_lock();
 	ipmr_free_table(net->ipv4.mrt);
+	net->ipv4.mrt = NULL;
+	rtnl_unlock();
 }
 #endif
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb4cf8b..f501ac04 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3105,10 +3105,11 @@
 			if (!first_ackt.v64)
 				first_ackt = last_ackt;
 
-			if (!(sacked & TCPCB_SACKED_ACKED))
+			if (!(sacked & TCPCB_SACKED_ACKED)) {
 				reord = min(pkts_acked, reord);
-			if (!after(scb->end_seq, tp->high_seq))
-				flag |= FLAG_ORIG_SACK_ACKED;
+				if (!after(scb->end_seq, tp->high_seq))
+					flag |= FLAG_ORIG_SACK_ACKED;
+			}
 		}
 
 		if (sacked & TCPCB_SACKED_ACKED)
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 27ca796..70bc6ab 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -322,7 +322,9 @@
 
 static void __net_exit fib6_rules_net_exit(struct net *net)
 {
+	rtnl_lock();
 	fib_rules_unregister(net->ipv6.fib6_rules_ops);
+	rtnl_unlock();
 }
 
 static struct pernet_operations fib6_rules_net_ops = {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7e80b61..36cf0ab 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -542,7 +542,8 @@
 {
 	struct sk_buff *frag;
 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+				inet6_sk(skb->sk) : NULL;
 	struct ipv6hdr *tmp_hdr;
 	struct frag_hdr *fh;
 	unsigned int mtu, hlen, left, len;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 52028f4..312e0ff 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -267,8 +267,8 @@
 		list_del(&mrt->list);
 		ip6mr_free_table(mrt);
 	}
-	rtnl_unlock();
 	fib_rules_unregister(net->ipv6.mr6_rules_ops);
+	rtnl_unlock();
 }
 #else
 #define ip6mr_for_each_table(mrt, net) \
@@ -336,7 +336,7 @@
 
 static void ip6mr_free_table(struct mr6_table *mrt)
 {
-	del_timer(&mrt->ipmr_expire_timer);
+	del_timer_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt);
 	kfree(mrt);
 }
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 895348e..a29a504 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1871,6 +1871,7 @@
 	l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0);
 	if (!l2tp_wq) {
 		pr_err("alloc_workqueue failed\n");
+		unregister_pernet_device(&l2tp_net_ops);
 		rc = -ENOMEM;
 		goto out;
 	}