Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 49dae1bc authored by Filipe Manana, committed by Chris Mason
Browse files

Btrfs: fix fsync data loss after a ranged fsync



While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is also ranged (it covers only a
portion of the file), we might have ordered operations that were started
before or while we're logging the inode and that fall outside the fsync
range.

Therefore, when a full ranged fsync finishes, don't remove every extent
map from the list of modified extent maps. For some of them - those that
fall outside our fsync range - the respective ordered operation hasn't
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it. We must let
the next fast fsync (one that checks only the modified list) see such an
extent map, log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).

A test case for xfstests follows.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
parent c47ca32d
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -1966,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)

	btrfs_init_log_ctx(&ctx);

	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
	if (ret < 0) {
		/* Fallthrough and commit/free transaction. */
		ret = 1;
+61 −16
Original line number Diff line number Diff line
@@ -95,7 +95,9 @@

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			     int inode_only);
			   int inode_only,
			   const loff_t start,
			   const loff_t end);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
@@ -3859,7 +3861,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			     int inode_only)
			   int inode_only,
			   const loff_t start,
			   const loff_t end)
{
	struct btrfs_path *path;
	struct btrfs_path *dst_path;
@@ -3876,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
	int ins_nr;
	bool fast_search = false;
	u64 ino = btrfs_ino(inode);
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;

	path = btrfs_alloc_path();
	if (!path)
@@ -4049,13 +4054,35 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			goto out_unlock;
		}
	} else if (inode_only == LOG_INODE_ALL) {
		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
		struct extent_map *em, *n;

		write_lock(&tree->lock);
		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
		write_lock(&em_tree->lock);
		/*
		 * We can't just remove every em if we're called for a ranged
		 * fsync - that is, one that doesn't cover the whole possible
		 * file range (0 to LLONG_MAX). This is because we can have
		 * em's that fall outside the range we're logging and therefore
		 * their ordered operations haven't completed yet
		 * (btrfs_finish_ordered_io() not invoked yet). This means we
		 * didn't get their respective file extent item in the fs/subvol
		 * tree yet, and need to let the next fast fsync (one which
		 * consults the list of modified extent maps) find the em so
		 * that it logs a matching file extent item and waits for the
		 * respective ordered operation to complete (if it's still
		 * running).
		 *
		 * Removing every em outside the range we're logging would make
		 * the next fast fsync not log their matching file extent items,
		 * therefore making us lose data after a log replay.
		 */
		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
					 list) {
			const u64 mod_end = em->mod_start + em->mod_len - 1;

			if (em->mod_start >= start && mod_end <= end)
				list_del_init(&em->list);
		write_unlock(&tree->lock);
		}
		write_unlock(&em_tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4065,8 +4092,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			goto out_unlock;
		}
	}

	write_lock(&em_tree->lock);
	/*
	 * If we're doing a ranged fsync and there are still modified extents
	 * in the list, we must run on the next fsync call as it might cover
	 * those extents (a full fsync or an fsync for other range).
	 */
	if (list_empty(&em_tree->modified_extents)) {
		BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
		BTRFS_I(inode)->last_log_commit =
			BTRFS_I(inode)->last_sub_trans;
	}
	write_unlock(&em_tree->lock);
out_unlock:
	if (unlikely(err))
		btrfs_put_logged_extents(&logged_list);
@@ -4161,7 +4199,10 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
			    	  struct btrfs_root *root, struct inode *inode,
			    	  struct dentry *parent, int exists_only,
				  struct dentry *parent,
				  const loff_t start,
				  const loff_t end,
				  int exists_only,
				  struct btrfs_log_ctx *ctx)
{
	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4207,7 +4248,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only);
	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
	if (ret)
		goto end_trans;

@@ -4235,7 +4276,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,

		if (BTRFS_I(inode)->generation >
		    root->fs_info->last_trans_committed) {
			ret = btrfs_log_inode(trans, root, inode, inode_only);
			ret = btrfs_log_inode(trans, root, inode, inode_only,
					      0, LLONG_MAX);
			if (ret)
				goto end_trans;
		}
@@ -4269,13 +4311,15 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
			  const loff_t start,
			  const loff_t end,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
				     0, ctx);
				     start, end, 0, ctx);
	dput(parent);

	return ret;
@@ -4512,6 +4556,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		    root->fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
				      LLONG_MAX, 1, NULL);
}
+2 −0
Original line number Diff line number Diff line
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
			  const loff_t start,
			  const loff_t end,
			  struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,