
Commit 72a3effa authored by Eric Dumazet, committed by David S. Miller

[NET]: Size listen hash tables using backlog hint



We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE = 512 slots) for
each LISTEN socket, regardless of parameters such as the listen backlog.

On x86_64, this means order-1 allocations (which might fail), even for 'small'
sockets expecting few connections. Conversely, a huge server wanting a
backlog of 50000 is slowed down a bit because of this fixed limit.

This patch makes the size of the listen hash table a dynamic parameter,
depending on:
- the net.core.somaxconn tunable (default: 128)
- the net.ipv4.tcp_max_syn_backlog tunable (default: 256, 1024 or 128, depending on machine memory)
- the backlog value given by the user application (2nd parameter of listen()); see the sketch below
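
As a rough illustration, here is a minimal sketch of how the final table size
falls out of these three inputs, mirroring reqsk_queue_alloc() in the diff
below. syn_table_entries() is a hypothetical helper, not part of the patch;
the somaxconn clamp itself is applied earlier, in sys_listen():

	/* Hypothetical helper: 'backlog' is the listen() value, already
	 * clamped to net.core.somaxconn by the socket layer.
	 */
	static unsigned int syn_table_entries(unsigned int backlog)
	{
		unsigned int n;

		n = min_t(u32, backlog, sysctl_max_syn_backlog);
		n = max_t(u32, n, 8);			/* never below 8 slots */
		return roundup_pow_of_two(n + 1);	/* power of two, > n */
	}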

For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of
kmalloc().

We still limit memory allocation with the two existing tunables (somaxconn &
tcp_max_syn_backlog), so for standard setups this patch actually reduces RAM
usage.
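
Concretely, on x86_64 with default tunables: a daemon calling listen(fd, 5)
gets max(min(5, 256), 8) = 8, rounded up to roundup_pow_of_two(8 + 1) = 16
slots, i.e. 16 * 8 = 128 bytes of pointers plus the listen_sock header, a
small kmalloc() instead of the old 8 KB order-1 block. A server that raises
both tunables and asks for a backlog of 50000 gets
roundup_pow_of_two(50001) = 65536 slots, a table of about 512 KB that is
satisfied by vmalloc().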

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 3c62f75a
include/net/request_sock.h  +4 −4
@@ -28,8 +28,8 @@ struct proto;

 struct request_sock_ops {
 	int		family;
-	kmem_cache_t	*slab;
 	int		obj_size;
+	kmem_cache_t	*slab;
 	int		(*rtx_syn_ack)(struct sock *sk,
 				       struct request_sock *req,
 				       struct dst_entry *dst);
@@ -51,13 +51,13 @@ struct request_sock {
 	u32				rcv_wnd;	  /* rcv_wnd offered first time */
 	u32				ts_recent;
 	unsigned long			expires;
-	struct request_sock_ops		*rsk_ops;
+	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
 	u32				secid;
 	u32				peer_secid;
 };
 
-static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
+static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
 {
 	struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);

@@ -121,7 +121,7 @@ struct request_sock_queue {
 };
 
 extern int reqsk_queue_alloc(struct request_sock_queue *queue,
-			     const int nr_table_entries);
+			     unsigned int nr_table_entries);
 
 static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
 {
include/net/tcp.h  +0 −1
@@ -138,7 +138,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define MAX_TCP_SYNCNT		127
 
 #define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */
-#define TCP_SYNQ_HSIZE		512	/* Size of SYNACK hash table */
 
 #define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
 #define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
net/core/request_sock.c  +25 −10
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/vmalloc.h>
 
 #include <net/request_sock.h>

@@ -29,22 +30,31 @@
  * it is absolutely not enough even at 100conn/sec. 256 cures most
  * of problems. This value is adjusted to 128 for very small machines
  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
+ * Note : Dont forget somaxconn that may limit backlog too.
  */
 int sysctl_max_syn_backlog = 256;
 
 int reqsk_queue_alloc(struct request_sock_queue *queue,
-		      const int nr_table_entries)
+		      unsigned int nr_table_entries)
 {
-	const int lopt_size = sizeof(struct listen_sock) +
-			      nr_table_entries * sizeof(struct request_sock *);
-	struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
-
+	size_t lopt_size = sizeof(struct listen_sock);
+	struct listen_sock *lopt;
+
+	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+	nr_table_entries = max_t(u32, nr_table_entries, 8);
+	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+	lopt_size += nr_table_entries * sizeof(struct request_sock *);
+	if (lopt_size > PAGE_SIZE)
+		lopt = __vmalloc(lopt_size,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
+	else
+		lopt = kzalloc(lopt_size, GFP_KERNEL);
 	if (lopt == NULL)
 		return -ENOMEM;
 
-	for (lopt->max_qlen_log = 6;
-	     (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+	for (lopt->max_qlen_log = 3;
+	     (1 << lopt->max_qlen_log) < nr_table_entries;
 	     lopt->max_qlen_log++);
 
 	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
@@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 {
 	/* make all the listen_opt local to us */
 	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+	size_t lopt_size = sizeof(struct listen_sock) +
+		lopt->nr_table_entries * sizeof(struct request_sock *);
 
 	if (lopt->qlen != 0) {
-		int i;
+		unsigned int i;
 
 		for (i = 0; i < lopt->nr_table_entries; i++) {
 			struct request_sock *req;
@@ -81,6 +93,9 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 	}
 
 	BUG_TRAP(lopt->qlen == 0);
-	kfree(lopt);
+	if (lopt_size > PAGE_SIZE)
+		vfree(lopt);
+	else
+		kfree(lopt);
 }

net/dccp/ipv4.c  +1 −1
@@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-static struct request_sock_ops dccp_request_sock_ops = {
+static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
 	.family		= PF_INET,
 	.obj_size	= sizeof(struct dccp_request_sock),
 	.rtx_syn_ack	= dccp_v4_send_response,
net/dccp/proto.c  +3 −3
@@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk)

 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
 
-static inline int dccp_listen_start(struct sock *sk)
+static inline int dccp_listen_start(struct sock *sk, int backlog)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	dp->dccps_role = DCCP_ROLE_LISTEN;
-	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+	return inet_csk_listen_start(sk, backlog);
 }
 
 int dccp_disconnect(struct sock *sk, int flags)
@@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
 		 * FIXME: here it probably should be sk->sk_prot->listen_start
 		 * see tcp_listen_start
 		 */
-		err = dccp_listen_start(sk);
+		err = dccp_listen_start(sk, backlog);
 		if (err)
 			goto out;
 	}