Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ce9adaa5 authored by Chris Mason
Browse files

Btrfs: Do metadata checksums for reads via a workqueue



Before, metadata checksumming was done by the callers of read_tree_block,
which would set EXTENT_CSUM bits in the extent tree to show that a given
range of pages was already checksummed and didn't need to be verified
again.

But, those bits could go away via try_to_releasepage, and the end
result was bogus checksum failures on pages that never left the cache.

The new code validates checksums when the page is read.  It is a little
tricky because metadata blocks can span pages and a single read may
end up going via multiple bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 728131d8
Loading
Loading
Loading
Loading
+0 −5
Original line number Diff line number Diff line
@@ -158,9 +158,6 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
	} else {
		root_gen = 0;
	}
	if (!(buf->flags & EXTENT_CSUM))
		WARN_ON(1);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
@@ -247,8 +244,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
		       root->fs_info->generation);
		WARN_ON(1);
	}
	if (!(buf->flags & EXTENT_CSUM))
		WARN_ON(1);

	header_trans = btrfs_header_generation(buf);
	spin_lock(&root->fs_info->hash_lock);
+4 −0
Original line number Diff line number Diff line
@@ -485,6 +485,10 @@ struct btrfs_fs_info {
	struct list_head trans_list;
	struct list_head hashers;
	struct list_head dead_roots;
	struct list_head end_io_work_list;
	struct work_struct end_io_work;
	spinlock_t end_io_work_lock;

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
	struct work_struct trans_work;
#else
+224 −34
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> // for block_sync_page
#include <linux/workqueue.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -45,6 +46,16 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
#endif

static struct extent_io_ops btree_extent_io_ops;
static struct workqueue_struct *end_io_workqueue;

/*
 * Bookkeeping for one bio whose completion is deferred to a workqueue.
 * btree_submit_bio_hook() fills one of these in for bios that do not have
 * the BIO_RW bit set, end_workqueue_bio() links it onto
 * fs_info->end_io_work_list, and btrfs_end_io_csum() consumes and frees it.
 */
struct end_io_wq {
	/* the bio being tracked */
	struct bio *bio;
	/* bio->bi_end_io as it was at submit time; put back before bio_endio() */
	bio_end_io_t *end_io;
	/* bio->bi_private as it was at submit time; put back before bio_endio() */
	void *private;
	/* filesystem whose end_io_work_list / end_io_work_lock this entry uses */
	struct btrfs_fs_info *info;
	/* error code recorded by end_workqueue_bio(), later passed to bio_endio() */
	int error;
	/* linkage on fs_info->end_io_work_list */
	struct list_head list;
};

struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
				    size_t page_offset, u64 start, u64 len,
@@ -219,11 +230,108 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
	return 0;
}

/*
 * Read-completion hook for btree pages: run the tree block checksum over
 * the block that starts at @page.
 *
 * @page:  page that finished reading
 * @start: byte offset of the start of the range that was read
 * @end:   byte offset of the end of that range (not used by this hook)
 * @state: extent state for the range (not used by this hook)
 *
 * Pages whose page->private is 0 or EXTENT_PAGE_PRIVATE are ignored.  For
 * any other value, page->private >> 2 is taken as the length of the block,
 * an extent buffer is built on top of @page, the pages after the first one
 * are read with wait == 1, and csum_tree_block() is called on the result.
 *
 * Returns 0, or -ENOMEM if the extent buffer could not be allocated.
 * NOTE(review): how callers treat a non-zero return is not visible here --
 * confirm that failing the read is the desired outcome for -ENOMEM.
 */
int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
			       struct extent_state *state)
{
	struct extent_io_tree *tree;
	u64 found_start;
	unsigned long len;
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (page->private == EXTENT_PAGE_PRIVATE)
		goto out;
	if (!page->private)
		goto out;
	len = page->private >> 2;
	if (len == 0) {
		WARN_ON(1);
	}
	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
	/*
	 * Every statement below dereferences eb, so bail out rather than
	 * oops if the allocation failed (assumes NULL signals failure).
	 */
	if (!eb)
		return -ENOMEM;

	/* bring in the pages that follow the first one before checksumming */
	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
				 btree_get_extent);
	btrfs_clear_buffer_defrag(eb);
	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
		       start, found_start, len);
		WARN_ON(1);
		goto err;
	}
	if (eb->first_page != page) {
		printk("bad first page %lu %lu\n", eb->first_page->index,
		       page->index);
		WARN_ON(1);
		goto err;
	}

	/*
	 * NOTE(review): the verification result is not propagated to the
	 * caller, which matches the previous behaviour of this hook;
	 * consider returning an error when csum_tree_block() fails.
	 */
	csum_tree_block(root, eb, 1);

	/*
	 * NOTE(review): the two error paths above jump past this call --
	 * confirm that skipping it there is intended.
	 */
	release_extent_buffer_tail_pages(eb);
err:
	free_extent_buffer(eb);
out:
	return 0;
}

#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
static void end_workqueue_bio(struct bio *bio, int err)
#else
static int end_workqueue_bio(struct bio *bio,
				   unsigned int bytes_done, int err)
#endif
{
	struct end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;
	unsigned long flags;

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
	if (bio->bi_size)
		return 1;
#endif

	fs_info = end_io_wq->info;
	spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
	end_io_wq->error = err;
	list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
	spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
	queue_work(end_io_workqueue, &fs_info->end_io_work);

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
	return 0;
#endif
}

static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct end_io_wq *end_io_wq;
	u64 offset;
	offset = bio->bi_sector << 9;

	if (rw & (1 << BIO_RW)) {
		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
	}

	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
	if (!end_io_wq)
		return -ENOMEM;

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
	end_io_wq->info = root->fs_info;
	end_io_wq->error = 0;
	end_io_wq->bio = bio;

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;

	if (offset == BTRFS_SUPER_INFO_OFFSET) {
		bio->bi_bdev = root->fs_info->sb->s_bdev;
		submit_bio(rw, bio);
@@ -363,36 +471,7 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
/*
 * Verify the checksum of tree block @buf, using the EXTENT_CSUM bit (on the
 * buffer itself and in the btree inode's io_tree) to skip blocks that were
 * already verified.
 *
 * NOTE(review): the hunk header directly above this function gives 36 old
 * lines and 7 new lines, so this listing appears to interleave both sides
 * of a diff: presumably everything between the opening brace and
 * "return ret;" is the removed implementation and
 * "return btrfs_buffer_uptodate(buf);" is its replacement -- confirm
 * against the real file.  Read literally, that last return is unreachable
 * because it directly follows "return ret;".
 */
int btrfs_verify_block_csum(struct btrfs_root *root,
			    struct extent_buffer *buf)
{
	struct extent_io_tree *io_tree;
	u64 end;
	int ret;

	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
	/* the buffer is already flagged as checksummed */
	if (buf->flags & EXTENT_CSUM)
		return 0;

	/* the csum bit is tracked on at most the first PAGE_CACHE_SIZE bytes */
	end = min_t(u64, buf->len, PAGE_CACHE_SIZE);
	end = buf->start + end - 1;
	/* unlocked fast path: the io_tree already has the range marked */
	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
		buf->flags |= EXTENT_CSUM;
		return 0;
	}
	lock_extent(io_tree, buf->start, end, GFP_NOFS);

	/* test again now that the range is locked */
	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
		buf->flags |= EXTENT_CSUM;
		ret = 0;
		goto out_unlock;
	}
WARN_ON(buf->flags & EXTENT_CSUM);

	/* run the checksum, then record the range and buffer as checksummed */
	ret = csum_tree_block(root, buf, 1);
	set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
	buf->flags |= EXTENT_CSUM;

out_unlock:
	unlock_extent(io_tree, buf->start, end, GFP_NOFS);
	return ret;
	/* unreachable as written -- see the NOTE(review) above */
	return btrfs_buffer_uptodate(buf);
}

struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
@@ -430,11 +509,15 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return NULL;
	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1,
				 btree_get_extent);

	ret = btrfs_verify_block_csum(root, buf);
	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0,
				       1, btree_get_extent);

	if (ret == 0) {
		buf->flags |= EXTENT_UPTODATE;
	}
	return buf;

}

int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -724,6 +807,99 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
	return 0;
}

/*
 * Decide whether the tree block covered by @bio can be checksummed yet.
 *
 * The bio's segments are scanned for a page whose page->private is neither
 * 0 nor EXTENT_PAGE_PRIVATE; page->private >> 2 of that page is taken as
 * the block length and its position as the block start.  Bytes carried by
 * that segment and the segments after it are accumulated in 'length'.
 *
 * Returns 1 when the bio carries at least the whole block (or no such page
 * was found).  Otherwise returns the result of extent_range_uptodate() for
 * the part of the block that this bio does not carry.
 */
static int bio_ready_for_csum(struct bio *bio)
{
	u64 length = 0;
	u64 buf_len = 0;
	u64 start = 0;
	struct page *page;
	struct extent_io_tree *io_tree = NULL;
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment(bvec, bio, i) {
		page = bvec->bv_page;
		if (page->private == EXTENT_PAGE_PRIVATE || !page->private) {
			/* not a block head: only add to the running byte count */
			length += bvec->bv_len;
			continue;
		}
		/* block head: restart the byte count from this segment */
		length = bvec->bv_len;
		buf_len = page->private >> 2;
		start = page_offset(page) + bvec->bv_offset;
		io_tree = &BTRFS_I(page->mapping->host)->io_tree;
	}
	/* are we fully contained in this bio? */
	if (buf_len <= length)
		return 1;

	/*
	 * io_tree is non-NULL here: buf_len > length >= 0 implies a block
	 * head page was seen in the loop above.
	 */
	return extent_range_uptodate(io_tree, start + length,
				     start + buf_len - 1);
}

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
void btrfs_end_io_csum(void *p)
#else
void btrfs_end_io_csum(struct work_struct *work)
#endif
{
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
	struct btrfs_fs_info *fs_info = p;
#else
	struct btrfs_fs_info *fs_info = container_of(work,
						     struct btrfs_fs_info,
						     end_io_work);
#endif
	unsigned long flags;
	struct end_io_wq *end_io_wq;
	struct bio *bio;
	struct list_head *next;
	int error;
	int was_empty;

	while(1) {
		spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
		if (list_empty(&fs_info->end_io_work_list)) {
			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
					       flags);
			return;
		}
		next = fs_info->end_io_work_list.next;
		list_del(next);
		spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);

		end_io_wq = list_entry(next, struct end_io_wq, list);

		bio = end_io_wq->bio;
		if (!bio_ready_for_csum(bio)) {
			spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
			was_empty = list_empty(&fs_info->end_io_work_list);
			list_add_tail(&end_io_wq->list,
				      &fs_info->end_io_work_list);
			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
					       flags);
			if (was_empty)
				return;
			continue;
		}
		error = end_io_wq->error;
		bio->bi_private = end_io_wq->private;
		bio->bi_end_io = end_io_wq->end_io;
		kfree(end_io_wq);
		bio_endio(bio, error);
	}
}


struct btrfs_root *open_ctree(struct super_block *sb,
			      struct btrfs_fs_devices *fs_devices)
{
@@ -750,11 +926,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
		err = -ENOMEM;
		goto fail;
	}
	end_io_workqueue = create_workqueue("btrfs-end-io");
	BUG_ON(!end_io_workqueue);

	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
	INIT_LIST_HEAD(&fs_info->trans_list);
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->hashers);
	INIT_LIST_HEAD(&fs_info->end_io_work_list);
	spin_lock_init(&fs_info->hash_lock);
	spin_lock_init(&fs_info->end_io_work_lock);
	spin_lock_init(&fs_info->delalloc_lock);
	spin_lock_init(&fs_info->new_trans_lock);

@@ -799,6 +980,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
			     fs_info->btree_inode->i_mapping, GFP_NOFS);
	fs_info->do_barriers = 1;

	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
#else
@@ -1044,6 +1226,8 @@ int close_ctree(struct btrfs_root *root)
	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);

	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
	flush_workqueue(end_io_workqueue);
	destroy_workqueue(end_io_workqueue);

	iput(fs_info->btree_inode);
#if 0
@@ -1171,12 +1355,18 @@ int btrfs_read_buffer(struct extent_buffer *buf)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
	struct inode *btree_inode = root->fs_info->btree_inode;
	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
	int ret;
	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
					buf, 0, 1, btree_get_extent);
	if (ret == 0) {
		buf->flags |= EXTENT_UPTODATE;
	}
	return ret;
}

static struct extent_io_ops btree_extent_io_ops = {
	.writepage_io_hook = btree_writepage_io_hook,
	.readpage_end_io_hook = btree_readpage_end_io_hook,
	.submit_bio_hook = btree_submit_bio_hook,
	/* note we're sharing with inode.c for the merge bio hook */
	.merge_bio_hook = btrfs_merge_bio_hook,
+0 −4
Original line number Diff line number Diff line
@@ -1898,10 +1898,6 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,

	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->io_tree,
			buf->start, buf->start + buf->len - 1,
			EXTENT_CSUM, GFP_NOFS);
	buf->flags |= EXTENT_CSUM;
	if (!btrfs_test_opt(root, SSD))
		btrfs_set_buffer_defrag(buf);
	trans->blocks_used++;
+78 −22
Original line number Diff line number Diff line
@@ -2592,6 +2592,22 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
	return p;
}

/*
 * Call page_cache_release() once on every page of @eb except the first
 * one.  A buffer with a single page is left untouched.
 *
 * Always returns 0.
 */
int release_extent_buffer_tail_pages(struct extent_buffer *eb)
{
	unsigned long total = num_extent_pages(eb->start, eb->len);
	unsigned long idx = 1;

	/* index 0 is the head page; start with the one after it */
	while (idx < total) {
		struct page *tail = extent_buffer_page(eb, idx);

		page_cache_release(tail);
		idx++;
	}
	return 0;
}


int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
			  unsigned long len)
{
@@ -2609,9 +2625,6 @@ int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
		if (eb->start <= start && eb->start + eb->len > start) {
			eb->flags &= ~EXTENT_UPTODATE;
		}
		if (eb->start == start) {
			eb->flags &= ~EXTENT_CSUM;
		}
		cur = cur->next;
	} while (cur != lru);
out:
@@ -2682,7 +2695,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
		page_cache_get(page0);
		mark_page_accessed(page0);
		set_page_extent_mapped(page0);
		WARN_ON(!PageUptodate(page0));
		set_page_extent_head(page0, len);
	} else {
		i = 0;
@@ -2933,13 +2945,39 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
}
EXPORT_SYMBOL(set_extent_buffer_uptodate);

/*
 * Report whether the byte range [@start, @end] of @tree is up to date.
 *
 * @tree:  extent io tree whose ->mapping holds the pages for the range
 * @start: first byte of the range
 * @end:   last byte of the range (inclusive)
 *
 * The range counts as up to date when the tree has EXTENT_UPTODATE set
 * across all of it, or failing that, when every page cache page looked up
 * for the range has PageUptodate set.
 *
 * Returns 1 if the range is considered up to date, 0 otherwise.
 */
int extent_range_uptodate(struct extent_io_tree *tree,
			  u64 start, u64 end)
{
	struct page *page;
	int uptodate;
	unsigned long index;

	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
		return 1;

	while (start <= end) {
		index = start >> PAGE_CACHE_SHIFT;
		page = find_get_page(tree->mapping, index);
		/*
		 * find_get_page() returns NULL when nothing is cached at
		 * this index; PageUptodate() and page_cache_release() must
		 * not be handed a NULL page.  Report the range as ready in
		 * that case: bio_ready_for_csum()/btrfs_end_io_csum() keep
		 * re-queueing a bio for as long as this returns 0.
		 * NOTE(review): assumes a page missing here will not show
		 * up again by itself -- confirm.
		 */
		if (!page)
			return 1;
		uptodate = PageUptodate(page);
		page_cache_release(page);
		if (!uptodate)
			return 0;
		start += PAGE_CACHE_SIZE;
	}
	return 1;
}

int extent_buffer_uptodate(struct extent_io_tree *tree,
			   struct extent_buffer *eb)
{
	int ret = 0;
	int ret2;
	int num_pages;
	int i;
	unsigned long num_pages;
	unsigned long i;
	struct page *page;
	int pg_uptodate = 1;

@@ -2975,13 +3013,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	int inc_all_pages = 0;
	unsigned long num_pages;
	struct bio *bio = NULL;

	if (eb->flags & EXTENT_UPTODATE)
		return 0;

	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
			   EXTENT_UPTODATE, 1)) {
		return 0;
	}
@@ -2997,17 +3038,30 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (PageUptodate(page)) {
			continue;
		}
		if (!wait) {
			if (TestSetPageLocked(page)) {
				continue;
			}
			if (TestSetPageLocked(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
		if (!PageUptodate(page)) {
			all_uptodate = 0;
		}
	}
	if (all_uptodate) {
		if (start_i == 0)
			eb->flags |= EXTENT_UPTODATE;
		goto unlock_exit;
	}

	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (inc_all_pages)
			page_cache_get(page);
		if (!PageUptodate(page)) {
			if (start_i == 0)
				inc_all_pages = 1;
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio);
			if (err) {
@@ -3034,6 +3088,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
	if (!ret)
		eb->flags |= EXTENT_UPTODATE;
	return ret;

unlock_exit:
	i = start_i;
	while(locked_pages > 0) {
		page = extent_buffer_page(eb, i);
		i++;
		unlock_page(page);
		locked_pages--;
	}
	return ret;
}
EXPORT_SYMBOL(read_extent_buffer_pages);

@@ -3048,7 +3112,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
	char *dst = (char *)dstv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	unsigned long num_pages = num_extent_pages(eb->start, eb->len);

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);
@@ -3057,11 +3120,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,

	while(len > 0) {
		page = extent_buffer_page(eb, i);
		if (!PageUptodate(page)) {
			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
			WARN_ON(1);
		}
		WARN_ON(!PageUptodate(page));

		cur = min(len, (PAGE_CACHE_SIZE - offset));
		kaddr = kmap_atomic(page, KM_USER1);
@@ -3105,7 +3163,6 @@ printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len,
	}

	p = extent_buffer_page(eb, i);
	WARN_ON(!PageUptodate(p));
	kaddr = kmap_atomic(p, km);
	*token = kaddr;
	*map = kaddr + offset;
@@ -3165,7 +3222,6 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,

	while(len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, (PAGE_CACHE_SIZE - offset));

Loading