Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4289e60c authored by David S. Miller
Browse files

Merge branch 'tc-skb-diet'

Willem de Bruijn says:

====================
convert tc_verd to integer bitfields

The skb tc_verd field takes up two bytes but uses far fewer bits.
Convert the remaining use cases to bitfields that fit in existing
holes (depending on config options) and potentially save the two
bytes in struct sk_buff.

This patchset is based on an earlier set by Florian Westphal and its
discussion (http://www.spinics.net/lists/netdev/msg329181.html).

Patches 1 and 2 are low hanging fruit: removing the last traces of
  data that are no longer stored in tc_verd.

Patches 3 and 4 convert tc_verd to individual bitfields (5 bits).

Patch 5 reduces TC_AT to a single bitfield,
  as AT_STACK is not valid here (unlike in the case of TC_FROM).

Patch 6 changes TC_FROM to two bitfields with clearly defined purpose.

It may be possible to reduce storage further after this initial round.
If tc_skip_classify is set only by IFB, testing skb_iif may suffice.
The L2 header pushing/popping logic can perhaps be shared with
AF_PACKET, which currently uses pkt_type for the same purpose.

Changes:
  RFC -> v1
    - (patch 3): remove no longer needed label in tcf_action_exec
    - (patch 5): set tc_at_ingress at the same points as existing
                 SET_TC_AT calls

Tested ingress mirred + netem + ifb:

  ip link set dev ifb0 up
  tc qdisc add dev eth0 ingress
  tc filter add dev eth0 parent ffff: \
    u32 match ip dport 8000 0xffff \
    action mirred egress redirect dev ifb0
  tc qdisc add dev ifb0 root netem delay 1000ms
  nc -u -l 8000 &
  ssh $otherhost nc -u $host 8000

Tested egress mirred:

  ip link add veth1 type veth peer name veth2
  ip link set dev veth1 up
  ip link set dev veth2 up
  tcpdump -n -i veth2 udp and dst port 8000 &

  tc qdisc add dev eth0 root handle 1: prio
  tc filter add dev eth0 parent 1:0 \
    u32 match ip dport 8000 0xffff \
    action mirred egress redirect dev veth1
  tc qdisc add dev veth1 root netem delay 1000ms
  nc -u $otherhost 8000

Tested ingress mirred:

  ip link add veth1 type veth peer name veth2
  ip link add veth3 type veth peer name veth4

  ip netns add ns0
  ip netns add ns1

  for i in 1 2 3 4; do \
    NS=ns$((${i}%2)); \
    ip link set dev veth${i} netns ${NS}; \
    ip netns exec ${NS} \
      ip addr add dev veth${i} 192.168.1.${i}/24; \
    ip netns exec ${NS} \
      ip link set dev veth${i} up; \
  done

  ip netns exec ns0 tc qdisc add dev veth2 ingress
  ip netns exec ns0 \
    tc filter add dev veth2 parent ffff: \
      u32 match ip dport 8000 0xffff \
      action mirred ingress redirect dev veth4

  ip netns exec ns0 \
    tcpdump -n -i veth4 udp and dst port 8000 &
  ip netns exec ns1 \
    nc -u 192.168.1.2 8000
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 29b84f20 bc31c905
Loading
Loading
Loading
Loading
+6 −10
Original line number Diff line number Diff line
@@ -78,10 +78,8 @@ static void ifb_ri_tasklet(unsigned long _txp)
	}

	while ((skb = __skb_dequeue(&txp->tq)) != NULL) {
		u32 from = G_TC_FROM(skb->tc_verd);

		skb->tc_verd = 0;
		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
		skb->tc_redirected = 0;
		skb->tc_skip_classify = 1;

		u64_stats_update_begin(&txp->tsync);
		txp->tx_packets++;
@@ -101,13 +99,12 @@ static void ifb_ri_tasklet(unsigned long _txp)
		rcu_read_unlock();
		skb->skb_iif = txp->dev->ifindex;

		if (from & AT_EGRESS) {
		if (!skb->tc_from_ingress) {
			dev_queue_xmit(skb);
		} else if (from & AT_INGRESS) {
		} else {
			skb_pull(skb, skb->mac_len);
			netif_receive_skb(skb);
		} else
			BUG();
		}
	}

	if (__netif_tx_trylock(txq)) {
@@ -239,7 +236,6 @@ static void ifb_setup(struct net_device *dev)
static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ifb_dev_private *dp = netdev_priv(dev);
	u32 from = G_TC_FROM(skb->tc_verd);
	struct ifb_q_private *txp = dp->tx_private + skb_get_queue_mapping(skb);

	u64_stats_update_begin(&txp->rsync);
@@ -247,7 +243,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
	txp->rx_bytes += skb->len;
	u64_stats_update_end(&txp->rsync);

	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) {
	if (!skb->tc_redirected || !skb->skb_iif) {
		dev_kfree_skb(skb);
		dev->stats.rx_dropped++;
		return NETDEV_TX_OK;
+2 −3
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@
#endif /* CONFIG_XFRM */

#include <linux/atomic.h>
#include <net/sch_generic.h>

#include <asm/octeon/octeon.h>

@@ -369,9 +370,7 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)

#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
#endif /* CONFIG_NET_CLS_ACT */
	skb_reset_tc(skb);
#endif /* CONFIG_NET_SCHED */
#endif /* REUSE_SKBUFFS_WITHOUT_FREE */

+10 −5
Original line number Diff line number Diff line
@@ -589,6 +589,10 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
 *	@pkt_type: Packet class
 *	@fclone: skbuff clone status
 *	@ipvs_property: skbuff is owned by ipvs
 *	@tc_skip_classify: do not classify packet. set by IFB device
 *	@tc_at_ingress: used within tc_classify to distinguish in/egress
 *	@tc_redirected: packet was redirected by a tc action
 *	@tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
 *	@peeked: this packet has been seen already, so stats have been
 *		done for it, don't do them again
 *	@nf_trace: netfilter packet trace flag
@@ -598,7 +602,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
 *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *	@skb_iif: ifindex of device we arrived on
 *	@tc_index: Traffic control index
 *	@tc_verd: traffic control verdict
 *	@hash: the packet hash
 *	@queue_mapping: Queue mapping for multiqueue devices
 *	@xmit_more: More SKBs are pending for this queue
@@ -749,13 +752,15 @@ struct sk_buff {
#ifdef CONFIG_NET_SWITCHDEV
	__u8			offload_fwd_mark:1;
#endif
	/* 2, 4 or 5 bit hole */
#ifdef CONFIG_NET_CLS_ACT
	__u8			tc_skip_classify:1;
	__u8			tc_at_ingress:1;
	__u8			tc_redirected:1;
	__u8			tc_from_ingress:1;
#endif

#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif

	union {
+19 −1
Original line number Diff line number Diff line
@@ -409,15 +409,33 @@ bool tcf_destroy(struct tcf_proto *tp, bool force);
void tcf_destroy_chain(struct tcf_proto __rcu **fl);
int skb_do_redirect(struct sk_buff *);

/* Clear the tc_redirected flag on @skb.
 *
 * The flag is only touched when CONFIG_NET_CLS_ACT is enabled; otherwise
 * this function has an empty body.
 */
static inline void skb_reset_tc(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_redirected = 0;
#endif
}

/* Report whether @skb is marked as being at tc ingress.
 *
 * Always returns false when CONFIG_NET_CLS_ACT is disabled.
 */
static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
	return G_TC_AT(skb->tc_verd) & AT_INGRESS;
	return skb->tc_at_ingress;
#else
	return false;
#endif
}

/* Test and clear the tc_skip_classify flag of @skb.
 *
 * Returns true if the flag was set, and resets it so that a later call on
 * the same skb returns false. Returns false if the flag was not set, or
 * when CONFIG_NET_CLS_ACT is disabled.
 */
static inline bool skb_skip_tc_classify(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_skip_classify) {
		/* consume the flag: it is honoured once, then cleared */
		skb->tc_skip_classify = 0;
		return true;
	}
#endif
	return false;
}

/* Reset all TX qdiscs greater than index of a device.  */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
+0 −55
Original line number Diff line number Diff line
@@ -4,61 +4,6 @@
#include <linux/types.h>
#include <linux/pkt_sched.h>

#ifdef __KERNEL__
/* I think i could have done better macros ; for now this is stolen from
 * some arch/mips code - jhs
*/
#define _TC_MAKE32(x) ((x))

#define _TC_MAKEMASK1(n) (_TC_MAKE32(1) << _TC_MAKE32(n))
#define _TC_MAKEMASK(v,n) (_TC_MAKE32((_TC_MAKE32(1)<<(v))-1) << _TC_MAKE32(n))
#define _TC_MAKEVALUE(v,n) (_TC_MAKE32(v) << _TC_MAKE32(n))
#define _TC_GETVALUE(v,n,m) ((_TC_MAKE32(v) & _TC_MAKE32(m)) >> _TC_MAKE32(n))

/* verdict bit breakdown 
 *
bit 0: when set -> this packet has been munged already

bit 1: when set -> It is ok to munge this packet

bit 2,3,4,5: Reclassify counter - sort of reverse TTL - if exceeded
assume loop

bit 6,7: Where this packet was last seen 
0: Above the transmit example at the socket level
1: on the Ingress
2: on the Egress

bit 8: when set --> Request not to classify on ingress. 

bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance

 *
 * */

#define S_TC_FROM          _TC_MAKE32(6)
#define M_TC_FROM          _TC_MAKEMASK(2,S_TC_FROM)
#define G_TC_FROM(x)       _TC_GETVALUE(x,S_TC_FROM,M_TC_FROM)
#define V_TC_FROM(x)       _TC_MAKEVALUE(x,S_TC_FROM)
#define SET_TC_FROM(v,n)   ((V_TC_FROM(n)) | (v & ~M_TC_FROM))
#define AT_STACK	0x0
#define AT_INGRESS	0x1
#define AT_EGRESS	0x2

#define TC_NCLS          _TC_MAKEMASK1(8)
#define SET_TC_NCLS(v)   ( TC_NCLS | (v & ~TC_NCLS))
#define CLR_TC_NCLS(v)   ( v & ~TC_NCLS)

#define S_TC_AT          _TC_MAKE32(12)
#define M_TC_AT          _TC_MAKEMASK(2,S_TC_AT)
#define G_TC_AT(x)       _TC_GETVALUE(x,S_TC_AT,M_TC_AT)
#define V_TC_AT(x)       _TC_MAKEVALUE(x,S_TC_AT)
#define SET_TC_AT(v,n)   ((V_TC_AT(n)) | (v & ~M_TC_AT))

#define MAX_REC_LOOP 4
#define MAX_RED_LOOP 4
#endif

/* Action attributes */
enum {
	TCA_ACT_UNSPEC,
Loading