
Commit 206f3985 authored by David S. Miller

Merge branch 'xen_netback'



xen-netback: IPv6 offload support

====================
This patch series adds support for checksum and large packet offloads
into xen-netback.  Testing has mainly been done using the Microsoft
network hardware certification suite running in Server 2008R2 VMs with
Citrix PV frontends.

v2:
- Fixed Wei's email address in Cc lines

v3:
- Responded to Wei's comments:
 - netif.h now updated with comments and a definition of
   XEN_NETIF_GSO_TYPE_NONE.
 - limited number of pullups
- Responded to Annie's comments:
 - New GSO_BIT macro

v4:
- Responded to more of Wei's comments
- Removed parsing of the IPv6 fragment header and added a warning

v5:
- Added comment concerning the value chosen for PKT_PROT_LEN
- Dropped deprecation of feature-no-csum-offload
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents c0f4ace7 82cada22
drivers/net/xen-netback/common.h  +9 −3
@@ -87,9 +87,13 @@ struct pending_tx_info {
 struct xenvif_rx_meta {
 	int id;
 	int size;
+	int gso_type;
 	int gso_size;
 };

+#define GSO_BIT(type) \
+	(1 << XEN_NETIF_GSO_TYPE_ ## type)
+
 /* Discriminate from any valid pending_idx value. */
 #define INVALID_PENDING_IDX 0xFFFF

@@ -150,10 +154,12 @@ struct xenvif {
 	u8               fe_dev_addr[6];

 	/* Frontend feature information. */
+	int gso_mask;
+	int gso_prefix_mask;
+
 	u8 can_sg:1;
-	u8 gso:1;
-	u8 gso_prefix:1;
-	u8 csum:1;
+	u8 ip_csum:1;
+	u8 ipv6_csum:1;

 	/* Internal feature information. */
 	u8 can_queue:1;	    /* can queue packets for receiver? */
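
The GSO_BIT() macro pastes its argument onto XEN_NETIF_GSO_TYPE_, so the new gso_mask/gso_prefix_mask fields are plain bitmasks over the GSO type numbers defined in netif.h (TCPV4 == 1, TCPV6 == 2; see the last hunk below). A minimal sketch of how the masks are built and tested, for illustration only (not part of the patch):

	int gso_mask = GSO_BIT(TCPV4) | GSO_BIT(TCPV6);	/* (1 << 1) | (1 << 2) == 0x6 */

	if ((1 << XEN_NETIF_GSO_TYPE_TCPV6) & gso_mask)
		;	/* TCPv6 GSO was negotiated with the frontend */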
drivers/net/xen-netback/interface.c  +11 −5
@@ -214,10 +214,14 @@ static netdev_features_t xenvif_fix_features(struct net_device *dev,

 	if (!vif->can_sg)
 		features &= ~NETIF_F_SG;
-	if (!vif->gso && !vif->gso_prefix)
+	if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV4))
 		features &= ~NETIF_F_TSO;
-	if (!vif->csum)
+	if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV6))
+		features &= ~NETIF_F_TSO6;
+	if (!vif->ip_csum)
 		features &= ~NETIF_F_IP_CSUM;
+	if (!vif->ipv6_csum)
+		features &= ~NETIF_F_IPV6_CSUM;

 	return features;
 }
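
The rewritten tests clear a TSO feature bit only when neither mask advertises the corresponding GSO type: ~(vif->gso_mask | vif->gso_prefix_mask) has the GSO_BIT(TCPV4) bit set exactly when both masks lack it. For example, with gso_mask == GSO_BIT(TCPV4) and gso_prefix_mask == 0, the first test is zero (NETIF_F_TSO is kept) while the second is non-zero (NETIF_F_TSO6 is cleared).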
@@ -306,7 +310,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	vif->domid  = domid;
 	vif->handle = handle;
 	vif->can_sg = 1;
-	vif->csum = 1;
+	vif->ip_csum = 1;
 	vif->dev = dev;

 	vif->credit_bytes = vif->remaining_credit = ~0UL;
@@ -316,8 +320,10 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	vif->credit_timeout.expires = jiffies;

 	dev->netdev_ops	= &xenvif_netdev_ops;
-	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
-	dev->features = dev->hw_features;
+	dev->hw_features = NETIF_F_SG |
+		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+		NETIF_F_TSO | NETIF_F_TSO6;
+	dev->features = dev->hw_features | NETIF_F_RXCSUM;
 	SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops);

 	dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
drivers/net/xen-netback/netback.c  +242 −52
@@ -109,15 +109,12 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif,
 	return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx));
 }

-/*
- * This is the amount of packet we copy rather than map, so that the
- * guest can't fiddle with the contents of the headers while we do
- * packet processing on them (netfilter, routing, etc).
- */
+/* This is a minimum size for the linear area to avoid lots of
+ * calls to __pskb_pull_tail() as we set up checksum offsets. The
+ * value 128 was chosen as it covers all IPv4 and most likely
+ * IPv6 headers.
+ */
-#define PKT_PROT_LEN    (ETH_HLEN + \
-			 VLAN_HLEN + \
-			 sizeof(struct iphdr) + MAX_IPOPTLEN + \
-			 sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
+#define PKT_PROT_LEN 128

 static u16 frag_get_pending_idx(skb_frag_t *frag)
 {
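
For comparison, the old expression worked out to 14 (ETH_HLEN) + 4 (VLAN_HLEN) + 20 + 40 (IPv4 header plus maximum options) + 20 + 40 (TCP header plus maximum options) = 138 bytes. The flat 128 still covers a maximal IPv4 header (60 bytes) plus a maximal TCP header (60 bytes); longer header chains, such as IPv6 extension headers, are pulled on demand by maybe_pull_tail() further down.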
@@ -145,7 +142,7 @@ static int max_required_rx_slots(struct xenvif *vif)
 	int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);

 	/* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */
-	if (vif->can_sg || vif->gso || vif->gso_prefix)
+	if (vif->can_sg || vif->gso_mask || vif->gso_prefix_mask)
 		max += MAX_SKB_FRAGS + 1; /* extra_info + frags */

 	return max;
@@ -317,6 +314,7 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif,
 	req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);

 	meta = npo->meta + npo->meta_prod++;
+	meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
 	meta->gso_size = 0;
 	meta->size = 0;
 	meta->id = req->id;
@@ -339,6 +337,7 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 	struct gnttab_copy *copy_gop;
 	struct xenvif_rx_meta *meta;
 	unsigned long bytes;
+	int gso_type;

 	/* Data must not cross a page boundary. */
 	BUG_ON(size + offset > PAGE_SIZE<<compound_order(page));
@@ -397,7 +396,14 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 		}

 		/* Leave a gap for the GSO descriptor. */
-		if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix)
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+			gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
+		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
+			gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
+		else
+			gso_type = XEN_NETIF_GSO_TYPE_NONE;
+
+		if (*head && ((1 << gso_type) & vif->gso_mask))
 			vif->rx.req_cons++;

 		*head = 0; /* There must be something in this buffer now. */
@@ -428,14 +434,28 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	unsigned char *data;
 	int head = 1;
 	int old_meta_prod;
+	int gso_type;
+	int gso_size;

 	old_meta_prod = npo->meta_prod;

+	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+		gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
+		gso_size = skb_shinfo(skb)->gso_size;
+	} else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
+		gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
+		gso_size = skb_shinfo(skb)->gso_size;
+	} else {
+		gso_type = XEN_NETIF_GSO_TYPE_NONE;
+		gso_size = 0;
+	}
+
 	/* Set up a GSO prefix descriptor, if necessary */
-	if (skb_shinfo(skb)->gso_size && vif->gso_prefix) {
+	if ((1 << gso_type) & vif->gso_prefix_mask) {
 		req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
 		meta = npo->meta + npo->meta_prod++;
-		meta->gso_size = skb_shinfo(skb)->gso_size;
+		meta->gso_type = gso_type;
+		meta->gso_size = gso_size;
 		meta->size = 0;
 		meta->id = req->id;
 	}
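
Throughout this file, gso_prefix_mask selects the scheme in which the GSO metadata occupies its own response slot ahead of the packet data, while gso_mask selects the XEN_NETRXF_extra_info scheme used later in xenvif_rx_action(); the xenbus changes below reject a frontend that requests both for the same GSO type.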
@@ -443,10 +463,13 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
 	meta = npo->meta + npo->meta_prod++;

-	if (!vif->gso_prefix)
-		meta->gso_size = skb_shinfo(skb)->gso_size;
-	else
-		meta->gso_size = 0;
+	if ((1 << gso_type) & vif->gso_mask) {
+		meta->gso_type = gso_type;
+		meta->gso_size = gso_size;
+	} else {
+		meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
+		meta->gso_size = 0;
+	}

 	meta->size = 0;
 	meta->id = req->id;
@@ -592,7 +615,8 @@ void xenvif_rx_action(struct xenvif *vif)

 		vif = netdev_priv(skb->dev);

-		if (vif->meta[npo.meta_cons].gso_size && vif->gso_prefix) {
+		if ((1 << vif->meta[npo.meta_cons].gso_type) &
+		    vif->gso_prefix_mask) {
 			resp = RING_GET_RESPONSE(&vif->rx,
 						 vif->rx.rsp_prod_pvt++);

@@ -629,7 +653,8 @@ void xenvif_rx_action(struct xenvif *vif)
 					vif->meta[npo.meta_cons].size,
 					flags);

-		if (vif->meta[npo.meta_cons].gso_size && !vif->gso_prefix) {
+		if ((1 << vif->meta[npo.meta_cons].gso_type) &
+		    vif->gso_mask) {
 			struct xen_netif_extra_info *gso =
 				(struct xen_netif_extra_info *)
 				RING_GET_RESPONSE(&vif->rx,
@@ -637,8 +662,8 @@ void xenvif_rx_action(struct xenvif *vif)

 			resp->flags |= XEN_NETRXF_extra_info;

+			gso->u.gso.type = vif->meta[npo.meta_cons].gso_type;
 			gso->u.gso.size = vif->meta[npo.meta_cons].gso_size;
-			gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
 			gso->u.gso.pad = 0;
 			gso->u.gso.features = 0;

@@ -1101,15 +1126,20 @@ static int xenvif_set_skb_gso(struct xenvif *vif,
 		return -EINVAL;
 	}

-	/* Currently only TCPv4 S.O. is supported. */
-	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+	switch (gso->u.gso.type) {
+	case XEN_NETIF_GSO_TYPE_TCPV4:
+		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+		break;
+	case XEN_NETIF_GSO_TYPE_TCPV6:
+		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+		break;
+	default:
 		netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
 		xenvif_fatal_tx_err(vif);
 		return -EINVAL;
 	}

 	skb_shinfo(skb)->gso_size = gso->u.gso.size;
-	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

 	/* Header must be checked, and gso_segs computed. */
 	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
@@ -1118,61 +1148,74 @@ static int xenvif_set_skb_gso(struct xenvif *vif,
 	return 0;
 }

-static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
+static inline void maybe_pull_tail(struct sk_buff *skb, unsigned int len)
 {
-	struct iphdr *iph;
-	int err = -EPROTO;
-	int recalculate_partial_csum = 0;
-
-	/*
-	 * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
-	 * peers can fail to set NETRXF_csum_blank when sending a GSO
-	 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
-	 * recalculate the partial checksum.
-	 */
-	if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
-		vif->rx_gso_checksum_fixup++;
-		skb->ip_summed = CHECKSUM_PARTIAL;
-		recalculate_partial_csum = 1;
+	if (skb_is_nonlinear(skb) && skb_headlen(skb) < len) {
+		/* If we need to pullup then pullup to the max, so we
+		 * won't need to do it again.
+		 */
+		int target = min_t(int, skb->len, MAX_TCP_HEADER);
+		__pskb_pull_tail(skb, target - skb_headlen(skb));
 	}
+}

-	/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
-	if (skb->ip_summed != CHECKSUM_PARTIAL)
-		return 0;
+static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb,
+			     int recalculate_partial_csum)
+{
+	struct iphdr *iph = (void *)skb->data;
+	unsigned int header_size;
+	unsigned int off;
+	int err = -EPROTO;

-	if (skb->protocol != htons(ETH_P_IP))
-		goto out;
+	off = sizeof(struct iphdr);

-	iph = (void *)skb->data;
+	header_size = skb->network_header + off + MAX_IPOPTLEN;
+	maybe_pull_tail(skb, header_size);
+
+	off = iph->ihl * 4;
+
 	switch (iph->protocol) {
 	case IPPROTO_TCP:
-		if (!skb_partial_csum_set(skb, 4 * iph->ihl,
+		if (!skb_partial_csum_set(skb, off,
 					  offsetof(struct tcphdr, check)))
 			goto out;

 		if (recalculate_partial_csum) {
 			struct tcphdr *tcph = tcp_hdr(skb);
+
+			header_size = skb->network_header +
+				off +
+				sizeof(struct tcphdr);
+			maybe_pull_tail(skb, header_size);
+
 			tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-							 skb->len - iph->ihl*4,
+							 skb->len - off,
 							 IPPROTO_TCP, 0);
 		}
 		break;
 	case IPPROTO_UDP:
-		if (!skb_partial_csum_set(skb, 4 * iph->ihl,
+		if (!skb_partial_csum_set(skb, off,
 					  offsetof(struct udphdr, check)))
 			goto out;

 		if (recalculate_partial_csum) {
 			struct udphdr *udph = udp_hdr(skb);
+
+			header_size = skb->network_header +
+				off +
+				sizeof(struct udphdr);
+			maybe_pull_tail(skb, header_size);
+
 			udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-							 skb->len - iph->ihl*4,
+							 skb->len - off,
 							 IPPROTO_UDP, 0);
 		}
 		break;
 	default:
 		if (net_ratelimit())
 			netdev_err(vif->dev,
-				   "Attempting to checksum a non-TCP/UDP packet, dropping a protocol %d packet\n",
+				   "Attempting to checksum a non-TCP/UDP packet, "
+				   "dropping a protocol %d packet\n",
 				   iph->protocol);
 		goto out;
 	}
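
Note that maybe_pull_tail() deliberately over-pulls: the first call brings up to MAX_TCP_HEADER bytes into the linear area, so the later per-header calls in checksum_setup_ip() and checksum_setup_ipv6() are normally no-ops. This is the "limited number of pullups" noted in the v3 changelog above.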
@@ -1183,6 +1226,158 @@ static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
 	return err;
 }

+static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb,
+			       int recalculate_partial_csum)
+{
+	int err = -EPROTO;
+	struct ipv6hdr *ipv6h = (void *)skb->data;
+	u8 nexthdr;
+	unsigned int header_size;
+	unsigned int off;
+	bool fragment = false;
+	bool done;
+
+	done = false;
+
+	off = sizeof(struct ipv6hdr);
+
+	header_size = skb->network_header + off;
+	maybe_pull_tail(skb, header_size);
+
+	nexthdr = ipv6h->nexthdr;
+
+	while ((off <= sizeof(struct ipv6hdr) + ntohs(ipv6h->payload_len)) &&
+	       !done) {
+		switch (nexthdr) {
+		case IPPROTO_DSTOPTS:
+		case IPPROTO_HOPOPTS:
+		case IPPROTO_ROUTING: {
+			struct ipv6_opt_hdr *hp = (void *)(skb->data + off);
+
+			header_size = skb->network_header +
+				off +
+				sizeof(struct ipv6_opt_hdr);
+			maybe_pull_tail(skb, header_size);
+
+			nexthdr = hp->nexthdr;
+			off += ipv6_optlen(hp);
+			break;
+		}
+		case IPPROTO_AH: {
+			struct ip_auth_hdr *hp = (void *)(skb->data + off);
+
+			header_size = skb->network_header +
+				off +
+				sizeof(struct ip_auth_hdr);
+			maybe_pull_tail(skb, header_size);
+
+			nexthdr = hp->nexthdr;
+			off += (hp->hdrlen+2)<<2;
+			break;
+		}
+		case IPPROTO_FRAGMENT:
+			fragment = true;
+			/* fall through */
+		default:
+			done = true;
+			break;
+		}
+	}
+
+	if (!done) {
+		if (net_ratelimit())
+			netdev_err(vif->dev, "Failed to parse packet header\n");
+		goto out;
+	}
+
+	if (fragment) {
+		if (net_ratelimit())
+			netdev_err(vif->dev, "Packet is a fragment!\n");
+		goto out;
+	}
+
+	switch (nexthdr) {
+	case IPPROTO_TCP:
+		if (!skb_partial_csum_set(skb, off,
+					  offsetof(struct tcphdr, check)))
+			goto out;
+
+		if (recalculate_partial_csum) {
+			struct tcphdr *tcph = tcp_hdr(skb);
+
+			header_size = skb->network_header +
+				off +
+				sizeof(struct tcphdr);
+			maybe_pull_tail(skb, header_size);
+
+			tcph->check = ~csum_ipv6_magic(&ipv6h->saddr,
+						       &ipv6h->daddr,
+						       skb->len - off,
+						       IPPROTO_TCP, 0);
+		}
+		break;
+	case IPPROTO_UDP:
+		if (!skb_partial_csum_set(skb, off,
+					  offsetof(struct udphdr, check)))
+			goto out;
+
+		if (recalculate_partial_csum) {
+			struct udphdr *udph = udp_hdr(skb);
+
+			header_size = skb->network_header +
+				off +
+				sizeof(struct udphdr);
+			maybe_pull_tail(skb, header_size);
+
+			udph->check = ~csum_ipv6_magic(&ipv6h->saddr,
+						       &ipv6h->daddr,
+						       skb->len - off,
+						       IPPROTO_UDP, 0);
+		}
+		break;
+	default:
+		if (net_ratelimit())
+			netdev_err(vif->dev,
+				   "Attempting to checksum a non-TCP/UDP packet, "
+				   "dropping a protocol %d packet\n",
+				   nexthdr);
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	return err;
+}
+
+static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
+{
+	int err = -EPROTO;
+	int recalculate_partial_csum = 0;
+
+	/* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
+	 * peers can fail to set NETRXF_csum_blank when sending a GSO
+	 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
+	 * recalculate the partial checksum.
+	 */
+	if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
+		vif->rx_gso_checksum_fixup++;
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		recalculate_partial_csum = 1;
+	}
+
+	/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		err = checksum_setup_ip(vif, skb, recalculate_partial_csum);
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		err = checksum_setup_ipv6(vif, skb, recalculate_partial_csum);
+
+	return err;
+}
+
 static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
 {
 	unsigned long now = jiffies;
@@ -1428,12 +1623,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)

 		xenvif_fill_frags(vif, skb);

-		/*
-		 * If the initial fragment was < PKT_PROT_LEN then
-		 * pull through some bytes from the other fragments to
-		 * increase the linear region to PKT_PROT_LEN bytes.
-		 */
-		if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
+		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
 			int target = min_t(int, skb->len, PKT_PROT_LEN);
 			__pskb_pull_tail(skb, target - skb_headlen(skb));
 		}
drivers/net/xen-netback/xenbus.c  +49 −3
@@ -105,6 +105,22 @@ static int netback_probe(struct xenbus_device *dev,
 			goto abort_transaction;
 		}

+		err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv6",
+				    "%d", sg);
+		if (err) {
+			message = "writing feature-gso-tcpv6";
+			goto abort_transaction;
+		}
+
+		/* We support partial checksum setup for IPv6 packets */
+		err = xenbus_printf(xbt, dev->nodename,
+				    "feature-ipv6-csum-offload",
+				    "%d", 1);
+		if (err) {
+			message = "writing feature-ipv6-csum-offload";
+			goto abort_transaction;
+		}
+
 		/* We support rx-copy path. */
 		err = xenbus_printf(xbt, dev->nodename,
 				    "feature-rx-copy", "%d", 1);
@@ -561,20 +577,50 @@ static int connect_rings(struct backend_info *be)
 		val = 0;
 	vif->can_sg = !!val;

+	vif->gso_mask = 0;
+	vif->gso_prefix_mask = 0;
+
 	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4",
 			 "%d", &val) < 0)
 		val = 0;
-	vif->gso = !!val;
+	if (val)
+		vif->gso_mask |= GSO_BIT(TCPV4);

 	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix",
 			 "%d", &val) < 0)
 		val = 0;
-	vif->gso_prefix = !!val;
+	if (val)
+		vif->gso_prefix_mask |= GSO_BIT(TCPV4);
+
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6",
+			 "%d", &val) < 0)
+		val = 0;
+	if (val)
+		vif->gso_mask |= GSO_BIT(TCPV6);
+
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6-prefix",
+			 "%d", &val) < 0)
+		val = 0;
+	if (val)
+		vif->gso_prefix_mask |= GSO_BIT(TCPV6);
+
+	if (vif->gso_mask & vif->gso_prefix_mask) {
+		xenbus_dev_fatal(dev, err,
+				 "%s: gso and gso prefix flags are not "
+				 "mutually exclusive",
+				 dev->otherend);
+		return -EOPNOTSUPP;
+	}

 	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
 			 "%d", &val) < 0)
 		val = 0;
-	vif->csum = !val;
+	vif->ip_csum = !val;
+
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-ipv6-csum-offload",
+			 "%d", &val) < 0)
+		val = 0;
+	vif->ipv6_csum = !!val;

 	/* Map the shared frame, irq etc. */
 	err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref,
include/xen/interface/io/netif.h  +17 −1
@@ -50,6 +50,20 @@
  * node as before.
  */

+/*
+ * "feature-no-csum-offload" should be used to turn IPv4 TCP/UDP checksum
+ * offload off or on. If it is missing then the feature is assumed to be on.
+ * "feature-ipv6-csum-offload" should be used to turn IPv6 TCP/UDP checksum
+ * offload on or off. If it is missing then the feature is assumed to be off.
+ */
+
+/*
+ * "feature-gso-tcpv4" and "feature-gso-tcpv6" advertise the capability to
+ * handle large TCP packets (in IPv4 or IPv6 form respectively). Neither
+ * frontends nor backends are assumed to be capable unless the flags are
+ * present.
+ */
+
 /*
  * This is the 'wire' format for packets:
  *  Request 1: xen_netif_tx_request  -- XEN_NETTXF_* (any flags)
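
For illustration, a frontend honouring the comments above would probe the backend's keys before enabling the matching offloads. A minimal sketch (not part of this patch) using the same xenbus_scanf() pattern as the backend code above, where netdev stands for the frontend's struct net_device:

	int val;

	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6",
			 "%d", &val) < 0)
		val = 0;	/* flag absent: assume no TCPv6 GSO */
	if (val)
		netdev->hw_features |= NETIF_F_TSO6;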
@@ -95,8 +109,10 @@ struct xen_netif_tx_request {
 #define _XEN_NETIF_EXTRA_FLAG_MORE	(0)
 #define  XEN_NETIF_EXTRA_FLAG_MORE	(1U<<_XEN_NETIF_EXTRA_FLAG_MORE)

-/* GSO types - only TCPv4 currently supported. */
+/* GSO types */
+#define XEN_NETIF_GSO_TYPE_NONE		(0)
 #define XEN_NETIF_GSO_TYPE_TCPV4	(1)
+#define XEN_NETIF_GSO_TYPE_TCPV6	(2)

 /*
  * This structure needs to fit within both netif_tx_request and