Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3d5b0605 authored by Ed L. Cashin, committed by Linus Torvalds
Browse files

aoe: for performance support larger packet payloads

This patch adds the ability to work with large packets composed of a number of
segments, using the scatter gather feature of the block layer (biovecs)
and the network layer (skb frag array).  The motivation is the performance
gained by using a packet data payload greater than a page size and by
using the network card's scatter gather feature.

Users of the out-of-tree aoe driver already had these changes, but since
early 2011, they have complained of increased memory utilization and
higher CPU utilization during heavy writes.[1] The commit below appears
related, as it disables scatter gather on non-IP protocols inside the
harmonize_features function, even when the NIC supports sg.

  commit f01a5236
  Author: Jesse Gross <jesse@nicira.com>
  Date:   Sun Jan 9 06:23:31 2011 +0000

      net offloading: Generalize netif_get_vlan_features().

With that regression in place, transmits always linearize sg AoE packets,
but in-kernel users did not have this patch.  Before 2.6.38, though, these
changes were working to allow sg to increase performance.

1. http://www.spinics.net/lists/linux-mm/msg15184.html



Signed-off-by: Ed Cashin <ecashin@coraid.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent a336d298
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -119,6 +119,8 @@ struct frame {
	ulong bcnt;
	sector_t lba;
	struct sk_buff *skb;
	struct bio_vec *bv;
	ulong bv_off;
};

struct aoeif {
+3 −0
Original line number Diff line number Diff line
@@ -254,6 +254,7 @@ aoeblk_gdalloc(void *vp)
{
	struct aoedev *d = vp;
	struct gendisk *gd;
	enum { KB = 1024, MB = KB * KB, READ_AHEAD = MB, };
	ulong flags;

	gd = alloc_disk(AOE_PARTITIONS);
@@ -279,6 +280,8 @@ aoeblk_gdalloc(void *vp)
	if (bdi_init(&d->blkq->backing_dev_info))
		goto err_blkq;
	spin_lock_irqsave(&d->lock, flags);
	blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
	d->blkq->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
	gd->major = AOE_MAJOR;
	gd->first_minor = d->sysminor * AOE_PARTITIONS;
	gd->fops = &aoe_bdops;
+96 −42
Original line number Diff line number Diff line
@@ -165,7 +165,8 @@ freeframe(struct aoedev *d)
						rf = f;
					continue;
				}
gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
gotone:				skb->truesize -= skb->data_len;
				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
				skb_trim(skb, 0);
				d->tgt = t;
				ifrotate(*t);
@@ -201,6 +202,24 @@ gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	return NULL;
}

/*
 * Describe cnt bytes of bio_vec data to skb, one skb_fill_page_desc()
 * call per bio_vec touched, using fragment indices 0, 1, 2, ...
 *
 * bv:  first bio_vec to take data from
 * off: page offset at which to start inside the first bio_vec; it is
 *      measured in the same units as bv->bv_offset
 * cnt: total number of bytes to describe
 *
 * Every bio_vec after the first is used starting from its own bv_offset.
 * The walk stops only once cnt bytes have been handed out; no limit on
 * the length of the bio_vec array is checked here.  One descriptor is
 * filled even when cnt is 0.
 */
static void
skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
{
	int idx = 0;

	for (;;) {
		/* bytes between off and the end of this bio_vec */
		ulong seg = bv->bv_len - (off - bv->bv_offset);

		if (seg > cnt)
			seg = cnt;
		skb_fill_page_desc(skb, idx, bv->bv_page, off, seg);
		idx++;

		cnt -= seg;
		if (cnt == 0)
			break;

		/* more to go: continue at the start of the next bio_vec */
		bv++;
		off = bv->bv_offset;
	}
}

static int
aoecmd_ata_rw(struct aoedev *d)
{
@@ -211,7 +230,7 @@ aoecmd_ata_rw(struct aoedev *d)
	struct bio_vec *bv;
	struct aoetgt *t;
	struct sk_buff *skb;
	ulong bcnt;
	ulong bcnt, fbcnt;
	char writebit, extbit;

	writebit = 0x10;
@@ -226,8 +245,28 @@ aoecmd_ata_rw(struct aoedev *d)
	bcnt = t->ifp->maxbcnt;
	if (bcnt == 0)
		bcnt = DEFAULTBCNT;
	if (bcnt > buf->bv_resid)
		bcnt = buf->bv_resid;
	if (bcnt > buf->resid)
		bcnt = buf->resid;
	fbcnt = bcnt;
	f->bv = buf->bv;
	f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
	do {
		if (fbcnt < buf->bv_resid) {
			buf->bv_resid -= fbcnt;
			buf->resid -= fbcnt;
			break;
		}
		fbcnt -= buf->bv_resid;
		buf->resid -= buf->bv_resid;
		if (buf->resid == 0) {
			d->inprocess = NULL;
			break;
		}
		buf->bv++;
		buf->bv_resid = buf->bv->bv_len;
		WARN_ON(buf->bv_resid == 0);
	} while (fbcnt);

	/* initialize the headers & frame */
	skb = f->skb;
	h = (struct aoe_hdr *) skb_mac_header(skb);
@@ -238,7 +277,6 @@ aoecmd_ata_rw(struct aoedev *d)
	t->nout++;
	f->waited = 0;
	f->buf = buf;
	f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
	f->bcnt = bcnt;
	f->lba = buf->sector;

@@ -253,10 +291,11 @@ aoecmd_ata_rw(struct aoedev *d)
		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
	}
	if (bio_data_dir(buf->bio) == WRITE) {
		skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
		skb_fillup(skb, f->bv, f->bv_off, bcnt);
		ah->aflags |= AOEAFL_WRITE;
		skb->len += bcnt;
		skb->data_len = bcnt;
		skb->truesize += bcnt;
		t->wpkts++;
	} else {
		t->rpkts++;
@@ -267,18 +306,7 @@ aoecmd_ata_rw(struct aoedev *d)

	/* mark all tracking fields and load out */
	buf->nframesout += 1;
	buf->bv_off += bcnt;
	buf->bv_resid -= bcnt;
	buf->resid -= bcnt;
	buf->sector += bcnt >> 9;
	if (buf->resid == 0) {
		d->inprocess = NULL;
	} else if (buf->bv_resid == 0) {
		buf->bv = ++bv;
		buf->bv_resid = bv->bv_len;
		WARN_ON(buf->bv_resid == 0);
		buf->bv_off = bv->bv_offset;
	}

	skb->dev = t->ifp->nd;
	skb = skb_clone(skb, GFP_ATOMIC);
@@ -365,14 +393,12 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
		put_lba(ah, f->lba);

		n = f->bcnt;
		if (n > DEFAULTBCNT)
			n = DEFAULTBCNT;
		ah->scnt = n >> 9;
		if (ah->aflags & AOEAFL_WRITE) {
			skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
				offset_in_page(f->bufaddr), n);
			skb_fillup(skb, f->bv, f->bv_off, n);
			skb->len = sizeof *h + sizeof *ah + n;
			skb->data_len = n;
			skb->truesize += n;
		}
	}
	skb->dev = t->ifp->nd;
@@ -531,20 +557,6 @@ rexmit_timer(ulong vp)
				ejectif(t, ifp);
				ifp = NULL;
			}

			if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
			&& ifp && ++ifp->lostjumbo > (t->nframes << 1)
			&& ifp->maxbcnt != DEFAULTBCNT) {
				printk(KERN_INFO
					"aoe: e%ld.%d: "
					"too many lost jumbo on "
					"%s:%pm - "
					"falling back to %d frames.\n",
					d->aoemajor, d->aoeminor,
					ifp->nd->name, t->addr,
					DEFAULTBCNT);
				ifp->maxbcnt = 0;
			}
			resend(d, t, f);
		}

@@ -737,6 +749,45 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
	part_stat_unlock();
}

/*
 * Copy cnt bytes out of skb, starting at skb offset 0, into the pages
 * referenced by a run of bio_vecs.
 *
 * bv:  first destination bio_vec
 * off: page offset at which to start writing inside the first bio_vec;
 *      measured in the same units as bv->bv_offset
 * skb: source of the data, read through skb_copy_bits()
 * cnt: total number of bytes to copy
 *
 * Every bio_vec after the first is filled starting from its own
 * bv_offset.  The destination address comes from page_address(), and
 * the return value of skb_copy_bits() is not examined.  No limit on the
 * length of the bio_vec array is checked here.
 */
static void
bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, ulong cnt)
{
	int from = 0;	/* running read offset into skb */

	/* the step clause only runs when more data remains to be copied */
	for (;; bv++, off = bv->bv_offset) {
		ulong chunk = bv->bv_len - (off - bv->bv_offset);
		char *dst;

		if (chunk > cnt)
			chunk = cnt;
		dst = page_address(bv->bv_page) + off;
		skb_copy_bits(skb, from, dst, chunk);

		from += chunk;
		cnt -= chunk;
		if (cnt == 0)
			return;
	}
}

/*
 * Move a frame's data position forward by cnt bytes.
 *
 * f->lba grows by cnt >> 9 (cnt / 512).  f->bv and f->bv_off are then
 * stepped across bio_vecs until the remaining count falls strictly
 * inside the current one.  When cnt ends exactly on a bio_vec boundary
 * the frame is left at the start (bv_offset) of the following bio_vec,
 * so that following entry is read.
 */
static void
fadvance(struct frame *f, ulong cnt)
{
	ulong room;

	f->lba += cnt >> 9;

	for (;;) {
		/* bytes between the current position and the end of f->bv */
		room = f->bv->bv_len - (f->bv_off - f->bv->bv_offset);
		if (room > cnt)
			break;

		/* consume the rest of this bio_vec and move to the next */
		cnt -= room;
		f->bv++;
		f->bv_off = f->bv->bv_offset;
	}

	f->bv_off += cnt;
}

void
aoecmd_ata_rsp(struct sk_buff *skb)
{
@@ -754,6 +805,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
	u16 aoemajor;

	hin = (struct aoe_hdr *) skb_mac_header(skb);
	skb_pull(skb, sizeof(*hin));
	aoemajor = get_unaligned_be16(&hin->major);
	d = aoedev_by_aoeaddr(aoemajor, hin->minor);
	if (d == NULL) {
@@ -791,7 +843,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)

	calc_rttavg(d, tsince(f->tag));

	ahin = (struct aoe_atahdr *) (hin+1);
	ahin = (struct aoe_atahdr *) skb->data;
	skb_pull(skb, sizeof(*ahin));
	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
	ahout = (struct aoe_atahdr *) (hout+1);
	buf = f->buf;
@@ -810,7 +863,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
		switch (ahout->cmdstat) {
		case ATA_CMD_PIO_READ:
		case ATA_CMD_PIO_READ_EXT:
			if (skb->len - sizeof *hin - sizeof *ahin < n) {
			if (skb->len < n) {
				printk(KERN_ERR
					"aoe: %s.  skb->len=%d need=%ld\n",
					"runt data size in read", skb->len, n);
@@ -818,7 +871,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
				spin_unlock_irqrestore(&d->lock, flags);
				return;
			}
			memcpy(f->bufaddr, ahin+1, n);
			bvcpy(f->bv, f->bv_off, skb, n);
		case ATA_CMD_PIO_WRITE:
		case ATA_CMD_PIO_WRITE_EXT:
			ifp = getif(t, skb->dev);
@@ -828,21 +881,22 @@ aoecmd_ata_rsp(struct sk_buff *skb)
					ifp->lostjumbo = 0;
			}
			if (f->bcnt -= n) {
				f->lba += n >> 9;
				f->bufaddr += n;
				fadvance(f, n);
				resend(d, t, f);
				goto xmit;
			}
			break;
		case ATA_CMD_ID_ATA:
			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
			if (skb->len < 512) {
				printk(KERN_INFO
					"aoe: runt data size in ataid.  skb->len=%d\n",
					skb->len);
				spin_unlock_irqrestore(&d->lock, flags);
				return;
			}
			ataid_complete(d, t, (char *) (ahin+1));
			if (skb_linearize(skb))
				break;
			ataid_complete(d, t, skb->data);
			break;
		default:
			printk(KERN_INFO
+1 −0
Original line number Diff line number Diff line
@@ -182,6 +182,7 @@ skbfree(struct sk_buff *skb)
			"cannot free skb -- memory leaked.");
		return;
	}
	skb->truesize -= skb->data_len;
	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	skb_trim(skb, 0);
	dev_kfree_skb(skb);
+9 −4
Original line number Diff line number Diff line
@@ -102,7 +102,9 @@ static int
aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
{
	struct aoe_hdr *h;
	struct aoe_atahdr *ah;
	u32 n;
	int sn;

	if (dev_net(ifp) != &init_net)
		goto exit;
@@ -110,13 +112,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		return 0;
	if (skb_linearize(skb))
		goto exit;
	if (!is_aoe_netif(ifp))
		goto exit;
	skb_push(skb, ETH_HLEN);	/* (1) */

	h = (struct aoe_hdr *) skb_mac_header(skb);
	sn = sizeof(*h) + sizeof(*ah);
	if (skb->len >= sn) {
		sn -= skb_headlen(skb);
		if (sn > 0 && !__pskb_pull_tail(skb, sn))
			goto exit;
	}
	h = (struct aoe_hdr *) skb->data;
	n = get_unaligned_be32(&h->tag);
	if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
		goto exit;