Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 03e8f644 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs fixes from Chris Mason:
 "This is an assorted set I've been queuing up:

  Jeff Mahoney tracked down a tricky one where we ended up starting IO
  on the wrong mapping for special files in btrfs_evict_inode.  A few
  people reported this one on the list.

  Filipe found (and provided a test for) a difficult bug in reading
  compressed extents, and Josef fixed up some quota record keeping with
  snapshot deletion.  Chandan killed off an accounting bug during DIO
  that led to WARN_ONs as we freed inodes"

* 'for-linus-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  Btrfs: keep dropped roots in cache until transaction commit
  Btrfs: Direct I/O: Fix space accounting
  btrfs: skip waiting on ordered range for special files
  Btrfs: fix read corruption of compressed and shared extents
  Btrfs: remove unnecessary locking of cleaner_mutex to avoid deadlock
  Btrfs: don't initialize a space info as full to prevent ENOSPC
parents 101688f5 2b9dbef2
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -44,8 +44,6 @@
#define BTRFS_INODE_IN_DELALLOC_LIST		9
#define BTRFS_INODE_READDIO_NEED_LOCK		10
#define BTRFS_INODE_HAS_PROPS		        11
/* DIO is ready to submit */
#define BTRFS_INODE_DIO_READY		        12
/*
 * The following 3 bits are meant only for the btree inode.
 * When any of them is set, it means an error happened while writing an
+0 −2
Original line number Diff line number Diff line
@@ -3765,9 +3765,7 @@ void close_ctree(struct btrfs_root *root)
		 * block groups queued for removal, the deletion will be
		 * skipped when we quit the cleaner thread.
		 */
		mutex_lock(&root->fs_info->cleaner_mutex);
		btrfs_delete_unused_bgs(root->fs_info);
		mutex_unlock(&root->fs_info->cleaner_mutex);

		ret = btrfs_commit_super(root);
		if (ret)
+2 −5
Original line number Diff line number Diff line
@@ -3742,10 +3742,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
	found->bytes_reserved = 0;
	found->bytes_readonly = 0;
	found->bytes_may_use = 0;
	if (total_bytes > 0)
	found->full = 0;
	else
		found->full = 1;
	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
	found->chunk_alloc = 0;
	found->flush = 0;
@@ -8668,7 +8665,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
	}

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
		btrfs_add_dropped_root(trans, root);
	} else {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
+57 −8
Original line number Diff line number Diff line
@@ -2798,7 +2798,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long prev_bio_flags,
			      unsigned long bio_flags)
			      unsigned long bio_flags,
			      bool force_bio_submit)
{
	int ret = 0;
	struct bio *bio;
@@ -2814,6 +2815,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
			contig = bio_end_sector(bio) == sector;

		if (prev_bio_flags != bio_flags || !contig ||
		    force_bio_submit ||
		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
		    bio_add_page(bio, page, page_size, offset) < page_size) {
			ret = submit_one_bio(rw, bio, mirror_num,
@@ -2910,7 +2912,8 @@ static int __do_readpage(struct extent_io_tree *tree,
			 get_extent_t *get_extent,
			 struct extent_map **em_cached,
			 struct bio **bio, int mirror_num,
			 unsigned long *bio_flags, int rw)
			 unsigned long *bio_flags, int rw,
			 u64 *prev_em_start)
{
	struct inode *inode = page->mapping->host;
	u64 start = page_offset(page);
@@ -2958,6 +2961,7 @@ static int __do_readpage(struct extent_io_tree *tree,
	}
	while (cur <= end) {
		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
		bool force_bio_submit = false;

		if (cur >= last_byte) {
			char *userpage;
@@ -3008,6 +3012,49 @@ static int __do_readpage(struct extent_io_tree *tree,
		block_start = em->block_start;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;

		/*
		 * If we have a file range that points to a compressed extent
		 * and it's followed by a consecutive file range that points to
		 * the same compressed extent (possibly with a different
		 * offset and/or length, so it either points to the whole extent
		 * or only part of it), we must make sure we do not submit a
		 * single bio to populate the pages for the 2 ranges because
		 * this makes the compressed extent read zero out the pages
		 * belonging to the 2nd range. Imagine the following scenario:
		 *
		 *  File layout
		 *  [0 - 8K]                     [8K - 24K]
		 *    |                               |
		 *    |                               |
		 * points to extent X,         points to extent X,
		 * offset 4K, length of 8K     offset 0, length 16K
		 *
		 * [extent X, compressed length = 4K uncompressed length = 16K]
		 *
		 * If the bio to read the compressed extent covers both ranges,
		 * it will decompress extent X into the pages belonging to the
		 * first range and then it will stop, zeroing out the remaining
		 * pages that belong to the other range that points to extent X.
		 * So here we make sure we submit 2 bios, one for the first
		 * range and another one for the second range. Both will target
		 * the same physical extent from disk, but we can't currently
		 * make the compressed bio endio callback populate the pages
		 * for both ranges because each compressed bio is tightly
		 * coupled with a single extent map, and each range can have
		 * an extent map with a different offset value relative to the
		 * uncompressed data of our extent and different lengths. This
		 * is a corner case so we prioritize correctness over
		 * non-optimal behavior (submitting 2 bios for the same extent).
		 */
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
		    prev_em_start && *prev_em_start != (u64)-1 &&
		    *prev_em_start != em->orig_start)
			force_bio_submit = true;

		if (prev_em_start)
			*prev_em_start = em->orig_start;

		free_extent_map(em);
		em = NULL;

@@ -3057,7 +3104,8 @@ static int __do_readpage(struct extent_io_tree *tree,
					 bdev, bio, pnr,
					 end_bio_extent_readpage, mirror_num,
					 *bio_flags,
					 this_bio_flag);
					 this_bio_flag,
					 force_bio_submit);
		if (!ret) {
			nr++;
			*bio_flags = this_bio_flag;
@@ -3089,6 +3137,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
	struct inode *inode;
	struct btrfs_ordered_extent *ordered;
	int index;
	u64 prev_em_start = (u64)-1;

	inode = pages[0]->mapping->host;
	while (1) {
@@ -3104,7 +3153,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,

	for (index = 0; index < nr_pages; index++) {
		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
			      mirror_num, bio_flags, rw);
			      mirror_num, bio_flags, rw, &prev_em_start);
		page_cache_release(pages[index]);
	}
}
@@ -3172,7 +3221,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
	}

	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
			    bio_flags, rw);
			    bio_flags, rw, NULL);
	return ret;
}

@@ -3198,7 +3247,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
	int ret;

	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
				      &bio_flags, READ);
			    &bio_flags, READ, NULL);
	if (bio)
		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
	return ret;
@@ -3451,7 +3500,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
						 sector, iosize, pg_offset,
						 bdev, &epd->bio, max_nr,
						 end_bio_extent_writepage,
						 0, 0, 0);
						 0, 0, 0, false);
			if (ret)
				SetPageError(page);
		}
@@ -3754,7 +3803,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
		ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
					 -1, end_bio_extent_buffer_writepage,
					 0, epd->bio_flags, bio_flags);
					 0, epd->bio_flags, bio_flags, false);
		epd->bio_flags = bio_flags;
		if (ret) {
			set_btree_ioerr(p);
+23 −22
Original line number Diff line number Diff line
@@ -5084,6 +5084,7 @@ void btrfs_evict_inode(struct inode *inode)
		goto no_delete;
	}
	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
	if (!special_file(inode->i_mode))
		btrfs_wait_ordered_range(inode, 0, (u64)-1);

	btrfs_free_io_failure_record(inode, 0, (u64)-1);
@@ -7408,6 +7409,10 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
	return em;
}

/*
 * Accounting state for one direct I/O write.  btrfs_direct_IO() keeps an
 * instance on its stack and publishes it through current->journal_info so
 * that btrfs_get_blocks_direct() can pick it up and update it.
 */
struct btrfs_dio_data {
	/*
	 * Number of extents the write is expected to need, computed up front
	 * as count rounded up to a multiple of BTRFS_MAX_EXTENT_SIZE, divided
	 * by that size.  Decremented as it is consumed in
	 * btrfs_get_blocks_direct(); once it reaches zero the inode's own
	 * outstanding_extents counter is bumped instead.
	 */
	u64 outstanding_extents;
	/*
	 * Bytes of the initial reservation (count rounded up to the sector
	 * size) not yet consumed.  Reduced by the mapped length in
	 * btrfs_get_blocks_direct(); on a failed write btrfs_direct_IO()
	 * releases whatever is left via btrfs_delalloc_release_space().
	 */
	u64 reserve;
};

static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
@@ -7415,10 +7420,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = NULL;
	u64 start = iblock << inode->i_blkbits;
	u64 lockstart, lockend;
	u64 len = bh_result->b_size;
	u64 *outstanding_extents = NULL;
	int unlock_bits = EXTENT_LOCKED;
	int ret = 0;

@@ -7436,7 +7441,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
		 * that anything that needs to check if there's a transaction doesn't get
		 * confused.
		 */
		outstanding_extents = current->journal_info;
		dio_data = current->journal_info;
		current->journal_info = NULL;
	}

@@ -7568,17 +7573,18 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
		 * within our reservation, otherwise we need to adjust our inode
		 * counter appropriately.
		 */
		if (*outstanding_extents) {
			(*outstanding_extents)--;
		if (dio_data->outstanding_extents) {
			(dio_data->outstanding_extents)--;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		current->journal_info = outstanding_extents;
		btrfs_free_reserved_data_space(inode, len);
		set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags);
		WARN_ON(dio_data->reserve < len);
		dio_data->reserve -= len;
		current->journal_info = dio_data;
	}

	/*
@@ -7601,8 +7607,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
unlock_err:
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
	if (outstanding_extents)
		current->journal_info = outstanding_extents;
	if (dio_data)
		current->journal_info = dio_data;
	return ret;
}

@@ -8329,7 +8335,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	u64 outstanding_extents = 0;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_dio_data dio_data = { 0 };
	size_t count = 0;
	int flags = 0;
	bool wakeup = true;
@@ -8367,7 +8374,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		ret = btrfs_delalloc_reserve_space(inode, count);
		if (ret)
			goto out;
		outstanding_extents = div64_u64(count +
		dio_data.outstanding_extents = div64_u64(count +
						BTRFS_MAX_EXTENT_SIZE - 1,
						BTRFS_MAX_EXTENT_SIZE);

@@ -8376,7 +8383,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		 * do the accounting properly if we go over the number we
		 * originally calculated.  Abuse current->journal_info for this.
		 */
		current->journal_info = &outstanding_extents;
		dio_data.reserve = round_up(count, root->sectorsize);
		current->journal_info = &dio_data;
	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
				     &BTRFS_I(inode)->runtime_flags)) {
		inode_dio_end(inode);
@@ -8391,16 +8399,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
	if (iov_iter_rw(iter) == WRITE) {
		current->journal_info = NULL;
		if (ret < 0 && ret != -EIOCBQUEUED) {
			/*
			 * If the error comes from submitting stage,
			 * btrfs_get_blocks_direct() has freed data space,
			 * and metadata space will be handled by
			 * finish_ordered_fn, don't do that again to make
			 * sure bytes_may_use is correct.
			 */
			if (!test_and_clear_bit(BTRFS_INODE_DIO_READY,
				     &BTRFS_I(inode)->runtime_flags))
				btrfs_delalloc_release_space(inode, count);
			if (dio_data.reserve)
				btrfs_delalloc_release_space(inode,
							dio_data.reserve);
		} else if (ret >= 0 && (size_t)ret < count)
			btrfs_delalloc_release_space(inode,
						     count - (size_t)ret);
Loading