Merge branch 'mptcp-Connection-and-accounting-fixes'
Mat Martineau says:
====================
mptcp: Connection and accounting fixes
Here are some miscellaneous fixes for MPTCP:
Patch 1 modifies an MPTCP hash so it doesn't depend on one of skb->dev
and skb->sk being non-NULL.
Patch 2 removes an extra destructor call when rejecting a join due to
port mismatch.
Patches 3 and 5 more cleanly handle error conditions with MP_JOIN and
syncookies, and update a related self test.
Patch 4 makes sure packets that trigger a subflow TCP reset during MPTCP
option header processing are correctly dropped.
Patch 6 addresses a rmem accounting issue that could keep packets in
subflow receive buffers longer than necessary, delaying MPTCP-level
ACKs.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index cb580b0..8b5af68 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -105,7 +105,7 @@
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts);
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
struct mptcp_out_options *opts);
@@ -227,9 +227,10 @@
return false;
}
-static inline void mptcp_incoming_options(struct sock *sk,
+static inline bool mptcp_incoming_options(struct sock *sk,
struct sk_buff *skb)
{
+ return true;
}
static inline void mptcp_skb_ext_move(struct sk_buff *to,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a5a8d0a..149ceb5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4247,6 +4247,9 @@
{
trace_tcp_receive_reset(sk);
+ /* mptcp can't tell us to ignore reset pkts,
+ * so just ignore the return value of mptcp_incoming_options().
+ */
if (sk_is_mptcp(sk))
mptcp_incoming_options(sk, skb);
@@ -4941,8 +4944,13 @@
bool fragstolen;
int eaten;
- if (sk_is_mptcp(sk))
- mptcp_incoming_options(sk, skb);
+ /* If a subflow has been reset, the packet should not continue
+ * to be processed, drop the packet.
+ */
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+ __kfree_skb(skb);
+ return;
+ }
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
@@ -6523,8 +6531,11 @@
case TCP_CLOSING:
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- if (sk_is_mptcp(sk))
- mptcp_incoming_options(sk, skb);
+ /* If a subflow has been reset, the packet should not
+ * continue to be processed, drop the packet.
+ */
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
+ goto discard;
break;
}
fallthrough;
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 52ea251..ff2cc0e 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -44,6 +44,7 @@
SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
+ SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 193466c..0663cb1 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -37,6 +37,7 @@
MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
+ MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index b5850af..4452455 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1035,7 +1035,8 @@
return hmac == mp_opt->ahmac;
}
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
+/* Return false if a subflow has been reset, else return true */
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
@@ -1053,12 +1054,16 @@
__mptcp_check_push(subflow->conn, sk);
__mptcp_data_acked(subflow->conn);
mptcp_data_unlock(subflow->conn);
- return;
+ return true;
}
mptcp_get_options(sk, skb, &mp_opt);
+
+ /* The subflow can be in close state only if check_fully_established()
+ * just sent a reset. If so, tell the caller to ignore the current packet.
+ */
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
- return;
+ return sk->sk_state != TCP_CLOSE;
if (mp_opt.fastclose &&
msk->local_key == mp_opt.rcvr_key) {
@@ -1100,7 +1105,7 @@
}
if (!mp_opt.dss)
- return;
+ return true;
/* we can't wait for recvmsg() to update the ack_seq, otherwise
* monodirectional flows will stuck
@@ -1119,12 +1124,12 @@
schedule_work(&msk->work))
sock_hold(subflow->conn);
- return;
+ return true;
}
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
- return;
+ return true;
memset(mpext, 0, sizeof(*mpext));
@@ -1153,6 +1158,8 @@
if (mpext->csum_reqd)
mpext->csum = mp_opt.csum;
}
+
+ return true;
}
static void mptcp_set_rwin(const struct tcp_sock *tp)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 7a5afa8..a889249 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -474,7 +474,7 @@
bool cleanup, rx_empty;
cleanup = (space > 0) && (space >= (old_space << 1));
- rx_empty = !atomic_read(&sk->sk_rmem_alloc);
+ rx_empty = !__mptcp_rmem(sk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -720,8 +720,10 @@
sk_rbuf = ssk_rbuf;
/* over limit? can't append more skbs to msk, Also, no need to wake-up*/
- if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
+ if (__mptcp_rmem(sk) > sk_rbuf) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
return;
+ }
/* Wake-up the reader only for in-sequence data */
mptcp_data_lock(sk);
@@ -1754,7 +1756,7 @@
if (!(flags & MSG_PEEK)) {
/* we will bulk release the skb memory later */
skb->destructor = NULL;
- msk->rmem_released += skb->truesize;
+ WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
__skb_unlink(skb, &msk->receive_queue);
__kfree_skb(skb);
}
@@ -1873,7 +1875,7 @@
atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
sk_mem_uncharge(sk, msk->rmem_released);
- msk->rmem_released = 0;
+ WRITE_ONCE(msk->rmem_released, 0);
}
static void __mptcp_splice_receive_queue(struct sock *sk)
@@ -2380,7 +2382,7 @@
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
msk->wmem_reserved = 0;
- msk->rmem_released = 0;
+ WRITE_ONCE(msk->rmem_released, 0);
msk->tx_pending_data = 0;
msk->first = NULL;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 426ed80..0f0c026 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -296,9 +296,17 @@
return (struct mptcp_sock *)sk;
}
+/* the msk socket don't use the backlog, also account for the bulk
+ * free memory
+ */
+static inline int __mptcp_rmem(const struct sock *sk)
+{
+ return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
+}
+
static inline int __mptcp_space(const struct sock *sk)
{
- return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released);
+ return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
}
static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 66d0b18..966f777 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -214,11 +214,6 @@
ntohs(inet_sk(sk_listener)->inet_sport),
ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
- sock_put((struct sock *)subflow_req->msk);
- mptcp_token_destroy_request(req);
- tcp_request_sock_ops.destructor(req);
- subflow_req->msk = NULL;
- subflow_req->mp_join = 0;
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
return -EPERM;
}
@@ -230,6 +225,8 @@
if (unlikely(req->syncookie)) {
if (mptcp_can_accept_new_subflow(subflow_req->msk))
subflow_init_req_cookie_join_save(subflow_req, skb);
+ else
+ return -EPERM;
}
pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
@@ -269,9 +266,7 @@
if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
return -EINVAL;
- if (mptcp_can_accept_new_subflow(subflow_req->msk))
- subflow_req->mp_join = 1;
-
+ subflow_req->mp_join = 1;
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
}
diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c
index abe0fd0..3712778 100644
--- a/net/mptcp/syncookies.c
+++ b/net/mptcp/syncookies.c
@@ -37,7 +37,21 @@
static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net)
{
- u32 i = skb_get_hash(skb) ^ net_hash_mix(net);
+ static u32 mptcp_join_hash_secret __read_mostly;
+ struct tcphdr *th = tcp_hdr(skb);
+ u32 seq, i;
+
+ net_get_random_once(&mptcp_join_hash_secret,
+ sizeof(mptcp_join_hash_secret));
+
+ if (th->syn)
+ seq = TCP_SKB_CB(skb)->seq;
+ else
+ seq = TCP_SKB_CB(skb)->seq - 1;
+
+ i = jhash_3words(seq, net_hash_mix(net),
+ (__force __u32)th->source << 16 | (__force __u32)th->dest,
+ mptcp_join_hash_secret);
return i % ARRAY_SIZE(join_entries);
}
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 9a191c1..f02f4de 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -1409,7 +1409,7 @@
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
- chk_join_nr "subflows limited by server w cookies" 2 2 1
+ chk_join_nr "subflows limited by server w cookies" 2 1 1
# test signal address with cookies
reset_with_cookies