Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 362d5204 authored by Daniel Borkmann, committed by David S. Miller
Browse files

Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer"



This reverts commit ef2820a7 ("net: sctp: Fix a_rwnd/rwnd management
to reflect real state of the receiver's buffer") as it introduced a
serious performance regression on SCTP over IPv4 and IPv6, though not
as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs.

Current state:

[root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60
iperf version 3.0.1 (10 January 2014)
Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64
Time: Fri, 11 Apr 2014 17:56:21 GMT
Connecting to host 192.168.241.3, port 5201
      Cookie: Lab200slot2.1397238981.812898.548918
[  4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201
Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test
[ ID] Interval           Transfer     Bandwidth
[  4]   0.00-1.09   sec  20.8 MBytes   161 Mbits/sec
[  4]   1.09-2.13   sec  10.8 MBytes  86.8 Mbits/sec
[  4]   2.13-3.15   sec  3.57 MBytes  29.5 Mbits/sec
[  4]   3.15-4.16   sec  4.33 MBytes  35.7 Mbits/sec
[  4]   4.16-6.21   sec  10.4 MBytes  42.7 Mbits/sec
[  4]   6.21-6.21   sec  0.00 Bytes    0.00 bits/sec
[  4]   6.21-7.35   sec  34.6 MBytes   253 Mbits/sec
[  4]   7.35-11.45  sec  22.0 MBytes  45.0 Mbits/sec
[  4]  11.45-11.45  sec  0.00 Bytes    0.00 bits/sec
[  4]  11.45-11.45  sec  0.00 Bytes    0.00 bits/sec
[  4]  11.45-11.45  sec  0.00 Bytes    0.00 bits/sec
[  4]  11.45-12.51  sec  16.0 MBytes   126 Mbits/sec
[  4]  12.51-13.59  sec  20.3 MBytes   158 Mbits/sec
[  4]  13.59-14.65  sec  13.4 MBytes   107 Mbits/sec
[  4]  14.65-16.79  sec  33.3 MBytes   130 Mbits/sec
[  4]  16.79-16.79  sec  0.00 Bytes    0.00 bits/sec
[  4]  16.79-17.82  sec  5.94 MBytes  48.7 Mbits/sec
(etc)

[root@Lab200slot2 ~]#  iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60
iperf version 3.0.1 (10 January 2014)
Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64
Time: Fri, 11 Apr 2014 19:08:41 GMT
Connecting to host 2001:db8:0:f101::1, port 5201
      Cookie: Lab200slot2.1397243321.714295.2b3f7c
[  4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201
Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test
[ ID] Interval           Transfer     Bandwidth
[  4]   0.00-1.00   sec   169 MBytes  1.42 Gbits/sec
[  4]   1.00-2.00   sec   201 MBytes  1.69 Gbits/sec
[  4]   2.00-3.00   sec   188 MBytes  1.58 Gbits/sec
[  4]   3.00-4.00   sec   174 MBytes  1.46 Gbits/sec
[  4]   4.00-5.00   sec   165 MBytes  1.39 Gbits/sec
[  4]   5.00-6.00   sec   199 MBytes  1.67 Gbits/sec
[  4]   6.00-7.00   sec   163 MBytes  1.36 Gbits/sec
[  4]   7.00-8.00   sec   174 MBytes  1.46 Gbits/sec
[  4]   8.00-9.00   sec   193 MBytes  1.62 Gbits/sec
[  4]   9.00-10.00  sec   196 MBytes  1.65 Gbits/sec
[  4]  10.00-11.00  sec   157 MBytes  1.31 Gbits/sec
[  4]  11.00-12.00  sec   175 MBytes  1.47 Gbits/sec
[  4]  12.00-13.00  sec   192 MBytes  1.61 Gbits/sec
[  4]  13.00-14.00  sec   199 MBytes  1.67 Gbits/sec
(etc)

After patch:

[root@Lab200slot2 ~]#  iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60
iperf version 3.0.1 (10 January 2014)
Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64
Time: Mon, 14 Apr 2014 16:40:48 GMT
Connecting to host 192.168.240.3, port 5201
      Cookie: Lab200slot2.1397493648.413274.65e131
[  4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201
Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test
[ ID] Interval           Transfer     Bandwidth
[  4]   0.00-1.00   sec   240 MBytes  2.02 Gbits/sec
[  4]   1.00-2.00   sec   239 MBytes  2.01 Gbits/sec
[  4]   2.00-3.00   sec   240 MBytes  2.01 Gbits/sec
[  4]   3.00-4.00   sec   239 MBytes  2.00 Gbits/sec
[  4]   4.00-5.00   sec   245 MBytes  2.05 Gbits/sec
[  4]   5.00-6.00   sec   240 MBytes  2.01 Gbits/sec
[  4]   6.00-7.00   sec   240 MBytes  2.02 Gbits/sec
[  4]   7.00-8.00   sec   239 MBytes  2.01 Gbits/sec

With the revert applied, SCTP performance is back to normal on the
latest upstream for both IPv4 and IPv6 and has the same throughput as
the 3.4.2 test kernel; it is steady and interval reports are smooth again.

Fixes: ef2820a7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer")
Reported-by: Peter Butler <pbutler@sonusnet.com>
Reported-by: Dongsheng Song <dongsheng.song@gmail.com>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Tested-by: Peter Butler <pbutler@sonusnet.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com>
Cc: Alexander Sverdlin <alexander.sverdlin@nsn.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent bfae2324
Loading
Loading
Loading
Loading
+13 −1
Original line number Original line Diff line number Diff line
@@ -1653,6 +1653,17 @@ struct sctp_association {
	/* This is the last advertised value of rwnd over a SACK chunk. */
	/* This is the last advertised value of rwnd over a SACK chunk. */
	__u32 a_rwnd;
	__u32 a_rwnd;


	/* Number of bytes by which the rwnd has slopped.  The rwnd is allowed
	 * to slop over a maximum of the association's frag_point.
	 */
	__u32 rwnd_over;

	/* Keeps track of rwnd pressure.  This happens when we have
	 * a window, but no receive buffer (i.e. small packets).  This one
	 * is released slowly (1 PMTU at a time).
	 */
	__u32 rwnd_press;

	/* This is the sndbuf size in use for the association.
	/* This is the sndbuf size in use for the association.
	 * This corresponds to the sndbuf size for the association,
	 * This corresponds to the sndbuf size for the association,
	 * as specified in the sk->sndbuf.
	 * as specified in the sk->sndbuf.
@@ -1881,7 +1892,8 @@ void sctp_assoc_update(struct sctp_association *old,
__u32 sctp_association_get_next_tsn(struct sctp_association *);
__u32 sctp_association_get_next_tsn(struct sctp_association *);


void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *);
void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *);
void sctp_assoc_rwnd_update(struct sctp_association *, bool);
void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int);
void sctp_assoc_set_primary(struct sctp_association *,
void sctp_assoc_set_primary(struct sctp_association *,
			    struct sctp_transport *);
			    struct sctp_transport *);
void sctp_assoc_del_nonprimary_peers(struct sctp_association *,
void sctp_assoc_del_nonprimary_peers(struct sctp_association *,
+65 −17
Original line number Original line Diff line number Diff line
@@ -1395,35 +1395,44 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc)
	return false;
	return false;
}
}


/* Update asoc's rwnd for the approximated state in the buffer,
/* Increase asoc's rwnd by len and send any window update SACK if needed. */
 * and check whether SACK needs to be sent.
void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
 */
void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer)
{
{
	int rx_count;
	struct sctp_chunk *sack;
	struct sctp_chunk *sack;
	struct timer_list *timer;
	struct timer_list *timer;


	if (asoc->ep->rcvbuf_policy)
	if (asoc->rwnd_over) {
		rx_count = atomic_read(&asoc->rmem_alloc);
		if (asoc->rwnd_over >= len) {
	else
			asoc->rwnd_over -= len;
		rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
		} else {
			asoc->rwnd += (len - asoc->rwnd_over);
			asoc->rwnd_over = 0;
		}
	} else {
		asoc->rwnd += len;
	}


	if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0)
	/* If we had window pressure, start recovering it
		asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1;
	 * once our rwnd had reached the accumulated pressure
	else
	 * threshold.  The idea is to recover slowly, but up
		asoc->rwnd = 0;
	 * to the initial advertised window.
	 */
	if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
		int change = min(asoc->pathmtu, asoc->rwnd_press);
		asoc->rwnd += change;
		asoc->rwnd_press -= change;
	}


	pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n",
	pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n",
		 __func__, asoc, asoc->rwnd, rx_count,
		 __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
		 asoc->base.sk->sk_rcvbuf);
		 asoc->a_rwnd);


	/* Send a window update SACK if the rwnd has increased by at least the
	/* Send a window update SACK if the rwnd has increased by at least the
	 * minimum of the association's PMTU and half of the receive buffer.
	 * minimum of the association's PMTU and half of the receive buffer.
	 * The algorithm used is similar to the one described in
	 * The algorithm used is similar to the one described in
	 * Section 4.2.3.3 of RFC 1122.
	 * Section 4.2.3.3 of RFC 1122.
	 */
	 */
	if (update_peer && sctp_peer_needs_update(asoc)) {
	if (sctp_peer_needs_update(asoc)) {
		asoc->a_rwnd = asoc->rwnd;
		asoc->a_rwnd = asoc->rwnd;


		pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u "
		pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u "
@@ -1445,6 +1454,45 @@ void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer)
	}
	}
}
}


/* Decrease asoc's rwnd by len.
 *
 * Charges len bytes against the association's receive window:
 *  - if rwnd can absorb len, it is simply reduced; when the receive
 *    buffer is already full, whatever window remains is moved into
 *    asoc->rwnd_press and rwnd is forced to 0;
 *  - if len exceeds rwnd, rwnd drops to 0 and the excess is added to
 *    asoc->rwnd_over.
 */
void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
{
	int rx_count;
	int over = 0;

	/* Being charged with no window left, or while already slopped
	 * over, is unexpected; note it for debugging.
	 */
	if (unlikely(!asoc->rwnd || asoc->rwnd_over))
		pr_debug("%s: association:%p has asoc->rwnd:%u, "
			 "asoc->rwnd_over:%u!\n", __func__, asoc,
			 asoc->rwnd, asoc->rwnd_over);

	/* Receive memory is accounted per association when the endpoint's
	 * rcvbuf_policy is set, otherwise per socket.
	 */
	if (asoc->ep->rcvbuf_policy)
		rx_count = atomic_read(&asoc->rmem_alloc);
	else
		rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);

	/* If we've reached or overflowed our receive buffer, announce
	 * a 0 rwnd if rwnd would still be positive.  Store the
	 * potential pressure overflow so that the window can be restored
	 * back to original value.
	 */
	if (rx_count >= asoc->base.sk->sk_rcvbuf)
		over = 1;

	if (asoc->rwnd >= len) {
		asoc->rwnd -= len;
		if (over) {
			asoc->rwnd_press += asoc->rwnd;
			asoc->rwnd = 0;
		}
	} else {
		/* Accumulate instead of overwriting: data may be charged
		 * while rwnd_over is already non-zero, and dropping the
		 * previous overshoot would let later rwnd increases
		 * credit back more window than was actually consumed.
		 */
		asoc->rwnd_over += len - asoc->rwnd;
		asoc->rwnd = 0;
	}

	pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n",
		 __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
		 asoc->rwnd_press);
}


/* Build the bind address list for the association based on info from the
/* Build the bind address list for the association based on info from the
 * local endpoint and the remote peer.
 * local endpoint and the remote peer.
+1 −1
Original line number Original line Diff line number Diff line
@@ -6178,7 +6178,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
	 * PMTU.  In cases, such as loopback, this might be a rather
	 * PMTU.  In cases, such as loopback, this might be a rather
	 * large spill over.
	 * large spill over.
	 */
	 */
	if ((!chunk->data_accepted) && (!asoc->rwnd ||
	if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over ||
	    (datalen > asoc->rwnd + asoc->frag_point))) {
	    (datalen > asoc->rwnd + asoc->frag_point))) {


		/* If this is the next TSN, consider reneging to make
		/* If this is the next TSN, consider reneging to make
+6 −0
Original line number Original line Diff line number Diff line
@@ -2115,6 +2115,12 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
		sctp_skb_pull(skb, copied);
		sctp_skb_pull(skb, copied);
		skb_queue_head(&sk->sk_receive_queue, skb);
		skb_queue_head(&sk->sk_receive_queue, skb);


		/* When only partial message is copied to the user, increase
		 * rwnd by that amount. If all the data in the skb is read,
		 * rwnd is updated when the event is freed.
		 */
		if (!sctp_ulpevent_is_notification(event))
			sctp_assoc_rwnd_increase(event->asoc, copied);
		goto out;
		goto out;
	} else if ((event->msg_flags & MSG_NOTIFICATION) ||
	} else if ((event->msg_flags & MSG_NOTIFICATION) ||
		   (event->msg_flags & MSG_EOR))
		   (event->msg_flags & MSG_EOR))
+2 −6
Original line number Original line Diff line number Diff line
@@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
	skb = sctp_event2skb(event);
	skb = sctp_event2skb(event);
	/* Set the owner and charge rwnd for bytes received.  */
	/* Set the owner and charge rwnd for bytes received.  */
	sctp_ulpevent_set_owner(event, asoc);
	sctp_ulpevent_set_owner(event, asoc);
	sctp_assoc_rwnd_update(asoc, false);
	sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb));


	if (!skb->data_len)
	if (!skb->data_len)
		return;
		return;
@@ -1011,7 +1011,6 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
{
{
	struct sk_buff *skb, *frag;
	struct sk_buff *skb, *frag;
	unsigned int	len;
	unsigned int	len;
	struct sctp_association *asoc;


	/* Current stack structures assume that the rcv buffer is
	/* Current stack structures assume that the rcv buffer is
	 * per socket.   For UDP style sockets this is not true as
	 * per socket.   For UDP style sockets this is not true as
@@ -1036,11 +1035,8 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
	}
	}


done:
done:
	asoc = event->asoc;
	sctp_assoc_rwnd_increase(event->asoc, len);
	sctp_association_hold(asoc);
	sctp_ulpevent_release_owner(event);
	sctp_ulpevent_release_owner(event);
	sctp_assoc_rwnd_update(asoc, true);
	sctp_association_put(asoc);
}
}


static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)