Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 71cdd40f authored by Peng Tao's avatar Peng Tao Committed by Trond Myklebust
Browse files

pnfsblock: write_pagelist handle zero invalid extents



For invalid extents, find other pages in the same fsblock and write them out.

[pnfsblock: write_begin]
Signed-off-by: default avatarFred Isaman <iisaman@citi.umich.edu>
Signed-off-by: default avatarBenny Halevy <bhalevy@panasas.com>
Signed-off-by: default avatarBenny Halevy <bhalevy@tonian.com>
Signed-off-by: default avatarPeng Tao <peng_tao@emc.com>
Signed-off-by: default avatarJim Rees <rees@umich.edu>
Signed-off-by: default avatarTrond Myklebust <Trond.Myklebust@netapp.com>
parent 31e6306a
Loading
Loading
Loading
Loading
+233 −42
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h>		/* struct bio */
#include <linux/buffer_head.h>	/* various write calls */

#include "blocklayout.h"

@@ -79,12 +80,8 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect)
 */
static int is_writable(struct pnfs_block_extent *be, sector_t isect)
{
	if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
		return 1;
	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
		return 0;
	else
		return bl_is_sector_init(be->be_inval, isect);
	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
		be->be_state == PNFS_BLOCK_INVALID_DATA);
}

/* The data we are handed might be spread across several bios.  We need
@@ -353,6 +350,31 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
	}
}

static void bl_end_io_write_zero(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		/* This is the zeroing page we added */
		end_page_writeback(page);
		page_cache_release(page);
	} while (bvec >= bio->bi_io_vec);
	if (!uptodate) {
		if (!wdata->pnfs_error)
			wdata->pnfs_error = -EIO;
		bl_set_lo_fail(wdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* This is basically copied from mpage_end_io_read */
static void bl_end_io_write(struct bio *bio, int err)
{
@@ -379,11 +401,8 @@ static void bl_write_cleanup(struct work_struct *work)
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	wdata = container_of(task, struct nfs_write_data, task);
	if (!wdata->task.tk_status) {
	if (!wdata->pnfs_error) {
		/* Marks for LAYOUTCOMMIT */
		/* BUG - this should be called after each bio, not after
		 * all finish, unless have some way of storing success/failure
		 */
		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
				     wdata->args.offset, wdata->args.count);
	}
@@ -391,38 +410,110 @@ static void bl_write_cleanup(struct work_struct *work)
}

/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void
bl_end_par_io_write(void *data)
static void bl_end_par_io_write(void *data)
{
	struct nfs_write_data *wdata = data;

	/* STUB - ignoring error handling */
	wdata->task.tk_status = 0;
	wdata->verf.committed = NFS_FILE_SYNC;
	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
	schedule_work(&wdata->task.u.tk_work);
}

/* FIXME STUB - mark intersection of layout and page as bad, so is not
 * used again.
 */
static void mark_bad_read(void)
{
	return;
}

/*
 * map_block:  map a requested I/0 block (isect) into an offset in the LVM
 * block_device
 */
static void
map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
{
	dprintk("%s enter be=%p\n", __func__, be);

	set_buffer_mapped(bh);
	bh->b_bdev = be->be_mdev;
	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);

	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
		bh->b_size);
	return;
}

/* Given an unmapped page, zero it or read in page for COW, page is locked
 * by caller.
 */
static int
init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
{
	struct buffer_head *bh = NULL;
	int ret = 0;
	sector_t isect;

	dprintk("%s enter, %p\n", __func__, page);
	BUG_ON(PageUptodate(page));
	if (!cow_read) {
		zero_user_segment(page, 0, PAGE_SIZE);
		SetPageUptodate(page);
		goto cleanup;
	}

	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
	if (!bh) {
		ret = -ENOMEM;
		goto cleanup;
	}

	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
	map_block(bh, isect, cow_read);
	if (!bh_uptodate_or_lock(bh))
		ret = bh_submit_read(bh);
	if (ret)
		goto cleanup;
	SetPageUptodate(page);

cleanup:
	bl_put_extent(cow_read);
	if (bh)
		free_buffer_head(bh);
	if (ret) {
		/* Need to mark layout with bad read...should now
		 * just use nfs4 for reads and writes.
		 */
		mark_bad_read();
	}
	return ret;
}

static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
	int i;
	int i, ret, npg_zero, pg_index, last = 0;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL;
	sector_t isect, extent_length = 0;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, last_isect = 0, extent_length = 0;
	struct parallel_io *par;
	loff_t offset = wdata->args.offset;
	size_t count = wdata->args.count;
	struct page **pages = wdata->args.pages;
	int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
	struct page *page;
	pgoff_t index;
	u64 temp;
	int npg_per_block =
	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;

	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
	 * We want to write each, and if there is an error remove it from
	 * list and call
	 * nfs_retry_request(req) to have it redone using nfs.
	 * QUEST? Do as block or per req?  Think have to do per block
	 * as part of end_bio
	 * We want to write each, and if there is an error set pnfs_error
	 * to have it redone using nfs.
	 */
	par = alloc_parallel(wdata);
	if (!par)
@@ -433,6 +524,90 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
	/* At this point, have to be more careful with error handling */

	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
	if (!be || !is_writable(be, isect)) {
		dprintk("%s no matching extents!\n", __func__);
		wdata->pnfs_error = -EINVAL;
		goto out;
	}

	/* First page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		temp = offset >> PAGE_CACHE_SHIFT;
		npg_zero = do_div(temp, npg_per_block);
		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
		extent_length = be->be_length - (isect - be->be_f_offset);

fill_invalid_ext:
		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
		for (;npg_zero > 0; npg_zero--) {
			/* page ref released in bl_end_io_write_zero */
			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
			dprintk("%s zero %dth page: index %lu isect %llu\n",
				__func__, npg_zero, index,
				(unsigned long long)isect);
			page =
			    find_or_create_page(wdata->inode->i_mapping, index,
						GFP_NOFS);
			if (!page) {
				dprintk("%s oom\n", __func__);
				wdata->pnfs_error = -ENOMEM;
				goto out;
			}

			/* PageDirty: Other will write this out
			 * PageWriteback: Other is writing this out
			 * PageUptodate: It was read before
			 * sector_initialized: already written out
			 */
			if (PageDirty(page) || PageWriteback(page) ||
			    bl_is_sector_init(be->be_inval, isect)) {
				print_page(page);
				unlock_page(page);
				page_cache_release(page);
				goto next_page;
			}
			if (!PageUptodate(page)) {
				/* New page, readin or zero it */
				init_page_for_write(page, cow_read);
			}
			set_page_writeback(page);
			unlock_page(page);

			ret = bl_mark_sectors_init(be->be_inval, isect,
						       PAGE_CACHE_SECTORS,
						       NULL);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				end_page_writeback(page);
				page_cache_release(page);
				wdata->pnfs_error = ret;
				goto out;
			}
			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
						 isect, page, be,
						 bl_end_io_write_zero, par);
			if (IS_ERR(bio)) {
				wdata->pnfs_error = PTR_ERR(bio);
				goto out;
			}
			/* FIXME: This should be done in bi_end_io */
			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
					     page->index << PAGE_CACHE_SHIFT,
					     PAGE_CACHE_SIZE);
next_page:
			isect += PAGE_CACHE_SECTORS;
			extent_length -= PAGE_CACHE_SECTORS;
		}
		if (last)
			goto write_done;
	}
	bio = bl_submit_bio(WRITE, bio);

	/* Middle pages */
	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
	for (i = pg_index; i < wdata->npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
@@ -442,35 +617,51 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
					     isect, NULL);
			if (!be || !is_writable(be, isect)) {
				wdata->pnfs_error = -ENOMEM;
				wdata->pnfs_error = -EINVAL;
				goto out;
			}
			extent_length = be->be_length -
			    (isect - be->be_f_offset);
		}
		for (;;) {
			if (!bio) {
				bio = bio_alloc(GFP_NOIO, wdata->npages - i);
				if (!bio) {
					wdata->pnfs_error = -ENOMEM;
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			ret = bl_mark_sectors_init(be->be_inval, isect,
						       PAGE_CACHE_SECTORS,
						       NULL);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				wdata->pnfs_error = ret;
				goto out;
			}
				bio->bi_sector = isect - be->be_f_offset +
					be->be_v_offset;
				bio->bi_bdev = be->be_mdev;
				bio->bi_end_io = bl_end_io_write;
				bio->bi_private = par;
		}
			if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
				break;
			bio = bl_submit_bio(WRITE, bio);
		bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
					 isect, pages[i], be,
					 bl_end_io_write, par);
		if (IS_ERR(bio)) {
			wdata->pnfs_error = PTR_ERR(bio);
			goto out;
		}
		isect += PAGE_CACHE_SECTORS;
		last_isect = isect;
		extent_length -= PAGE_CACHE_SECTORS;
	}
	wdata->res.count = (isect << SECTOR_SHIFT) - (offset);
	if (count < wdata->res.count)

	/* Last page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		bio = bl_submit_bio(WRITE, bio);
		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
		npg_zero = npg_per_block - do_div(temp, npg_per_block);
		if (npg_zero < npg_per_block) {
			last = 1;
			goto fill_invalid_ext;
		}
	}

write_done:
	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
	if (count < wdata->res.count) {
		wdata->res.count = count;
	}
out:
	bl_put_extent(be);
	bl_submit_bio(WRITE, bio);