Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d23ff701 authored by Julian Anastasov, committed by David S. Miller
Browse files

tcp: add generic netlink support for tcp_metrics



Add support for the generic netlink family "tcp_metrics". Locking is
unchanged; the only difference is that entries can now be unlinked and
freed after an RCU grace period. We implement get/del for a single
entry, plus dump, so that show/flush filtering can be done in user
space. A del request without an address attribute flushes the entries
for all addresses, unfortunately while holding genl_mutex.

v2:
- remove rcu_assign_pointer as suggested by Eric Dumazet,
it is not needed because there are no other writes under lock
- move the flushing code in tcp_metrics_flush_all

v3:
- remove synchronize_rcu on flush as suggested by Eric Dumazet

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent ab868256
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -363,6 +363,7 @@ header-y += sysctl.h
header-y += sysinfo.h
header-y += taskstats.h
header-y += tcp.h
header-y += tcp_metrics.h
header-y += telephony.h
header-y += termios.h
header-y += time.h
+54 −0
Original line number Diff line number Diff line
/* tcp_metrics.h - TCP Metrics Interface */

#ifndef _LINUX_TCP_METRICS_H
#define _LINUX_TCP_METRICS_H

#include <linux/types.h>

/* NETLINK_GENERIC related info
 */
#define TCP_METRICS_GENL_NAME		"tcp_metrics"
#define TCP_METRICS_GENL_VERSION	0x1

/* Indices of the individual metric values.
 *
 * When the values are reported over netlink they are nested inside
 * TCP_METRICS_ATTR_VALS, and each value uses its index + 1 as the nested
 * attribute type.
 */
enum tcp_metric_index {
	TCP_METRIC_RTT,
	TCP_METRIC_RTTVAR,
	TCP_METRIC_SSTHRESH,
	TCP_METRIC_CWND,
	TCP_METRIC_REORDERING,

	/* Always last.  */
	__TCP_METRIC_MAX,
};

/* Highest valid index, so an array indexed by the enum needs
 * TCP_METRIC_MAX + 1 elements.
 */
#define TCP_METRIC_MAX	(__TCP_METRIC_MAX - 1)

/* Netlink attribute types of the "tcp_metrics" family.
 *
 * The trailing comment on each entry gives the payload type.  For
 * TCP_METRICS_ATTR_VALS, "nested +1" means that the nested attributes are
 * typed by enum tcp_metric_index + 1.
 */
enum {
	TCP_METRICS_ATTR_UNSPEC,
	TCP_METRICS_ATTR_ADDR_IPV4,		/* u32 */
	TCP_METRICS_ATTR_ADDR_IPV6,		/* binary */
	TCP_METRICS_ATTR_AGE,			/* msecs */
	TCP_METRICS_ATTR_TW_TSVAL,		/* u32, raw, rcv tsval */
	TCP_METRICS_ATTR_TW_TS_STAMP,		/* s32, sec age */
	TCP_METRICS_ATTR_VALS,			/* nested +1, u32 */
	TCP_METRICS_ATTR_FOPEN_MSS,		/* u16 */
	TCP_METRICS_ATTR_FOPEN_SYN_DROPS,	/* u16, count of drops */
	TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,	/* msecs age */
	TCP_METRICS_ATTR_FOPEN_COOKIE,		/* binary */

	/* Always last: one past the highest attribute type */
	__TCP_METRICS_ATTR_MAX,
};

/* Highest valid attribute type */
#define TCP_METRICS_ATTR_MAX	(__TCP_METRICS_ATTR_MAX - 1)

/* Commands of the "tcp_metrics" generic netlink family.
 *
 * GET returns a single entry or, as a dump request, all entries.
 * DEL removes a single entry; without an address attribute it flushes
 * the entries for all addresses.
 */
enum {
	TCP_METRICS_CMD_UNSPEC,
	TCP_METRICS_CMD_GET,
	TCP_METRICS_CMD_DEL,

	/* Always last: one past the highest command */
	__TCP_METRICS_CMD_MAX,
};

/* Highest valid command number */
#define TCP_METRICS_CMD_MAX	(__TCP_METRICS_CMD_MAX - 1)

#endif /* _LINUX_TCP_METRICS_H */
+341 −13
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>

#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
@@ -17,20 +18,10 @@
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/genetlink.h>

int sysctl_tcp_nometrics_save __read_mostly;

/* NOTE(review): <linux/tcp_metrics.h>, included above, defines an enum with
 * the same tag and enumerators.  There TCP_METRIC_MAX is the highest index,
 * whereas here it is the number of metrics.  This looks like the pre-change
 * copy shown by the diff - confirm that it is gone from the resulting file.
 */
enum tcp_metric_index {
	TCP_METRIC_RTT,
	TCP_METRIC_RTTVAR,
	TCP_METRIC_SSTHRESH,
	TCP_METRIC_CWND,
	TCP_METRIC_REORDERING,

	/* Always last.  */
	TCP_METRIC_MAX,
};

struct tcp_fastopen_metrics {
	u16	mss;
	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
@@ -45,8 +36,10 @@ struct tcp_metrics_block {
	u32				tcpm_ts;
	u32				tcpm_ts_stamp;
	u32				tcpm_lock;
	u32				tcpm_vals[TCP_METRIC_MAX];
	u32				tcpm_vals[TCP_METRIC_MAX + 1];
	struct tcp_fastopen_metrics	tcpm_fastopen;

	struct rcu_head			rcu_head;
};

static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -690,6 +683,325 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
	rcu_read_unlock();
}

/* Descriptor of the "tcp_metrics" generic netlink family; it is registered
 * together with tcp_metrics_nl_ops from tcp_metrics_init().
 */
static struct genl_family tcp_metrics_nl_family = {
	/* presumably asks for a dynamically assigned family id - confirm */
	.id		= GENL_ID_GENERATE,
	/* no family specific header in front of the attributes */
	.hdrsize	= 0,
	.name		= TCP_METRICS_GENL_NAME,
	.version	= TCP_METRICS_GENL_VERSION,
	.maxattr	= TCP_METRICS_ATTR_MAX,
	/* the handlers resolve the namespace of each request themselves */
	.netnsok	= true,
};

/* Validation policy for attributes received with GET/DEL requests.
 *
 * Only the two address attributes are accepted as input.
 * NOTE(review): for NLA_BINARY, .len looks like an upper bound rather than
 * an exact length, which would be why parse_nl_addr() checks the IPv6
 * attribute length itself - confirm.
 */
static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
	[TCP_METRICS_ATTR_ADDR_IPV4]	= { .type = NLA_U32, },
	[TCP_METRICS_ATTR_ADDR_IPV6]	= { .type = NLA_BINARY,
					    .len = sizeof(struct in6_addr), },
	/* Following attributes are not received for GET/DEL,
	 * we keep them for reference
	 */
#if 0
	[TCP_METRICS_ATTR_AGE]		= { .type = NLA_MSECS, },
	[TCP_METRICS_ATTR_TW_TSVAL]	= { .type = NLA_U32, },
	[TCP_METRICS_ATTR_TW_TS_STAMP]	= { .type = NLA_S32, },
	[TCP_METRICS_ATTR_VALS]		= { .type = NLA_NESTED, },
	[TCP_METRICS_ATTR_FOPEN_MSS]	= { .type = NLA_U16, },
	[TCP_METRICS_ATTR_FOPEN_SYN_DROPS]	= { .type = NLA_U16, },
	[TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]	= { .type = NLA_MSECS, },
	[TCP_METRICS_ATTR_FOPEN_COOKIE]	= { .type = NLA_BINARY,
					    .len = TCP_FASTOPEN_COOKIE_MAX, },
#endif
};

/* Add attributes, caller cancels its header on failure */
static int tcp_metrics_fill_info(struct sk_buff *msg,
				 struct tcp_metrics_block *tm)
{
	struct nlattr *nest;
	int i;

	switch (tm->tcpm_addr.family) {
	case AF_INET:
		if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
				tm->tcpm_addr.addr.a4) < 0)
			goto nla_put_failure;
		break;
	case AF_INET6:
		if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
			    tm->tcpm_addr.addr.a6) < 0)
			goto nla_put_failure;
		break;
	default:
		return -EAFNOSUPPORT;
	}

	if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
			  jiffies - tm->tcpm_stamp) < 0)
		goto nla_put_failure;
	if (tm->tcpm_ts_stamp) {
		if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
				(s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
			goto nla_put_failure;
		if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
				tm->tcpm_ts) < 0)
			goto nla_put_failure;
	}

	{
		int n = 0;

		nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
		if (!nest)
			goto nla_put_failure;
		for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
			if (!tm->tcpm_vals[i])
				continue;
			if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
				goto nla_put_failure;
			n++;
		}
		if (n)
			nla_nest_end(msg, nest);
		else
			nla_nest_cancel(msg, nest);
	}

	{
		struct tcp_fastopen_metrics tfom_copy[1], *tfom;
		unsigned int seq;

		do {
			seq = read_seqbegin(&fastopen_seqlock);
			tfom_copy[0] = tm->tcpm_fastopen;
		} while (read_seqretry(&fastopen_seqlock, seq));

		tfom = tfom_copy;
		if (tfom->mss &&
		    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
				tfom->mss) < 0)
			goto nla_put_failure;
		if (tfom->syn_loss &&
		    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
				tfom->syn_loss) < 0 ||
		     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
				jiffies - tfom->last_syn_loss) < 0))
			goto nla_put_failure;
		if (tfom->cookie.len > 0 &&
		    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
			    tfom->cookie.len, tfom->cookie.val) < 0)
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* Add one multipart GET message for @tm to the dump buffer @skb.
 *
 * Returns the result of genlmsg_end() on success.  Returns -EMSGSIZE when
 * the header cannot be added or when filling the attributes fails; in the
 * latter case the partially built message is cancelled first.
 */
static int tcp_metrics_dump_info(struct sk_buff *skb,
				 struct netlink_callback *cb,
				 struct tcp_metrics_block *tm)
{
	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
				    cb->nlh->nlmsg_seq,
				    &tcp_metrics_nl_family, NLM_F_MULTI,
				    TCP_METRICS_CMD_GET);

	if (!msg_hdr)
		return -EMSGSIZE;

	if (tcp_metrics_fill_info(skb, tm) < 0) {
		genlmsg_cancel(skb, msg_hdr);
		return -EMSGSIZE;
	}

	return genlmsg_end(skb, msg_hdr);
}

/* Dump callback for TCP_METRICS_CMD_GET.
 *
 * Walks the hash table of the requesting socket's namespace and adds one
 * message per entry to @skb.  The position is kept in @cb: args[0] is the
 * row and args[1] the index within that row's chain at which the next call
 * resumes.  Returns skb->len.
 */
static int tcp_metrics_nl_dump(struct sk_buff *skb,
			       struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	unsigned int nr_rows = 1U << net->ipv4.tcp_metrics_hash_log;
	unsigned int row = cb->args[0];
	int skip = cb->args[1];
	int col = skip;

	while (row < nr_rows) {
		struct tcpm_hash_bucket *bucket;
		struct tcp_metrics_block *tm;

		bucket = net->ipv4.tcp_metrics_hash + row;
		col = 0;

		rcu_read_lock();
		tm = rcu_dereference(bucket->chain);
		while (tm) {
			/* Entries before the resume point were already sent */
			if (col >= skip &&
			    tcp_metrics_dump_info(skb, cb, tm) < 0) {
				/* Stop here; this entry is retried next time */
				rcu_read_unlock();
				goto save_pos;
			}
			tm = rcu_dereference(tm->tcpm_next);
			col++;
		}
		rcu_read_unlock();

		/* Only the first visited row starts in the middle */
		skip = 0;
		row++;
	}

save_pos:
	cb->args[0] = row;
	cb->args[1] = col;
	return skb->len;
}

/* Extract the address of a GET/DEL request.
 *
 * An IPv4 attribute takes precedence over an IPv6 one.  On success @addr is
 * filled in and @hash receives the raw hash input; callers still reduce it
 * with hash_32() to a table index.
 *
 * Returns 0 when an address was parsed and -EINVAL when the IPv6 attribute
 * is not exactly sizeof(struct in6_addr) bytes long.  Without any address
 * attribute the result is 1 if @optional is set and -EAFNOSUPPORT otherwise.
 */
static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
			 unsigned int *hash, int optional)
{
	struct nlattr *a;

	a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
	if (a) {
		addr->family = AF_INET;
		addr->addr.a4 = nla_get_be32(a);
		*hash = (__force unsigned int) addr->addr.a4;
		return 0;
	}
	a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
	if (a) {
		/* Require the exact address length: a shorter payload would
		 * make the memcpy() below read past the attribute.
		 */
		if (nla_len(a) != sizeof(struct in6_addr))
			return -EINVAL;
		addr->family = AF_INET6;
		memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
		*hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
		return 0;
	}
	return optional ? 1 : -EAFNOSUPPORT;
}

/* Handler for a non-dump TCP_METRICS_CMD_GET: reply with the entry that
 * matches the address given in the request.
 *
 * Returns the result of genlmsg_reply() on success.  Otherwise returns the
 * error of parse_nl_addr(), -ENOMEM when the reply cannot be allocated,
 * -EMSGSIZE when the reply header does not fit, -ESRCH when no entry
 * matches, or the error of tcp_metrics_fill_info().
 */
static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
	struct net *net = genl_info_net(info);
	struct tcp_metrics_block *tm;
	struct inetpeer_addr addr;
	struct sk_buff *msg;
	unsigned int hash;
	void *reply;
	int ret;

	/* The address is mandatory for GET */
	ret = parse_nl_addr(info, &addr, &hash, 0);
	if (ret < 0)
		return ret;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
				  info->genlhdr->cmd);
	if (!reply) {
		ret = -EMSGSIZE;
		goto out_free;
	}

	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

	/* Find the entry and copy it out while still inside the RCU section */
	ret = -ESRCH;
	rcu_read_lock();
	tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
	while (tm && !addr_same(&tm->tcpm_addr, &addr))
		tm = rcu_dereference(tm->tcpm_next);
	if (tm)
		ret = tcp_metrics_fill_info(msg, tm);
	rcu_read_unlock();

	if (ret < 0)
		goto out_free;

	genlmsg_end(msg, reply);
	return genlmsg_reply(msg, info);

out_free:
	nlmsg_free(msg);
	return ret;
}

/* Dereference an RCU pointer on the update side: the lockdep condition
 * requires both the genl lock and tcp_metrics_lock to be held.
 */
#define deref_locked_genl(p)	\
	rcu_dereference_protected(p, lockdep_genl_is_held() && \
				     lockdep_is_held(&tcp_metrics_lock))

/* Same, but the lockdep condition only requires the genl lock; used on
 * entries that were already unlinked from their chain.
 */
#define deref_genl(p)	rcu_dereference_protected(p, lockdep_genl_is_held())

/* Remove every entry from the metrics hash table of @net.
 *
 * Each chain is detached from its bucket under tcp_metrics_lock; the
 * detached entries are then handed to kfree_rcu() with the lock released.
 * Always returns 0.
 */
static int tcp_metrics_flush_all(struct net *net)
{
	unsigned int nr_rows = 1U << net->ipv4.tcp_metrics_hash_log;
	unsigned int row;

	for (row = 0; row < nr_rows; row++) {
		struct tcpm_hash_bucket *bucket;
		struct tcp_metrics_block *victim, *next;

		bucket = net->ipv4.tcp_metrics_hash + row;

		/* Unlink the whole chain in one step */
		spin_lock_bh(&tcp_metrics_lock);
		victim = deref_locked_genl(bucket->chain);
		if (victim)
			bucket->chain = NULL;
		spin_unlock_bh(&tcp_metrics_lock);

		/* Fetch the successor before queueing the entry for freeing */
		for (; victim; victim = next) {
			next = deref_genl(victim->tcpm_next);
			kfree_rcu(victim, rcu_head);
		}
	}

	return 0;
}

static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
	struct tcpm_hash_bucket *hb;
	struct tcp_metrics_block *tm;
	struct tcp_metrics_block __rcu **pp;
	struct inetpeer_addr addr;
	unsigned int hash;
	struct net *net = genl_info_net(info);
	int ret;

	ret = parse_nl_addr(info, &addr, &hash, 1);
	if (ret < 0)
		return ret;
	if (ret > 0)
		return tcp_metrics_flush_all(net);

	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
	hb = net->ipv4.tcp_metrics_hash + hash;
	pp = &hb->chain;
	spin_lock_bh(&tcp_metrics_lock);
	for (tm = deref_locked_genl(*pp); tm;
	     pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
		if (addr_same(&tm->tcpm_addr, &addr)) {
			*pp = tm->tcpm_next;
			break;
		}
	}
	spin_unlock_bh(&tcp_metrics_lock);
	if (!tm)
		return -ESRCH;
	kfree_rcu(tm, rcu_head);
	return 0;
}

/* Operations of the "tcp_metrics" family.  Both commands share the same
 * attribute policy and carry the GENL_ADMIN_PERM flag.
 */
static struct genl_ops tcp_metrics_nl_ops[] = {
	{
		/* single entry lookup (doit) and full table dump (dumpit) */
		.cmd = TCP_METRICS_CMD_GET,
		.doit = tcp_metrics_nl_cmd_get,
		.dumpit = tcp_metrics_nl_dump,
		.policy = tcp_metrics_nl_policy,
		.flags = GENL_ADMIN_PERM,
	},
	{
		/* delete one entry, or flush everything without an address */
		.cmd = TCP_METRICS_CMD_DEL,
		.doit = tcp_metrics_nl_cmd_del,
		.policy = tcp_metrics_nl_policy,
		.flags = GENL_ADMIN_PERM,
	},
};

static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
@@ -753,5 +1065,21 @@ static __net_initdata struct pernet_operations tcp_net_metrics_ops = {

/* Register the per-namespace metrics operations and the "tcp_metrics"
 * generic netlink family.
 *
 * The pernet operations are registered exactly once and the result is
 * checked.  If the netlink family cannot be registered afterwards, the
 * pernet registration is undone so that no half-initialised state remains.
 * Failures are not reported to the caller.
 */
void __init tcp_metrics_init(void)
{
	int ret;

	ret = register_pernet_subsys(&tcp_net_metrics_ops);
	if (ret < 0)
		return;

	ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
					    tcp_metrics_nl_ops,
					    ARRAY_SIZE(tcp_metrics_nl_ops));
	if (ret < 0)
		unregister_pernet_subsys(&tcp_net_metrics_ops);
}