btrfs: always wait on ordered extents at fsync time (b5e6c3e1) · Commits · e / devices / android_kernel_fairphone_FP5

fs/btrfs/file.c

+4 −52

Original line number	Diff line number	Diff line
		@@ -2068,53 +2068,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
		atomic_inc(&root->log_batch);
		full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
		&BTRFS_I(inode)->runtime_flags);

		/*
		* We might have have had more pages made dirty after calling
		* start_ordered_ops and before acquiring the inode's i_mutex.
		*/
		if (full_sync) {
		/*
		* For a full sync, we need to make sure any ordered operations
		* start and finish before we start logging the inode, so that
		* all extents are persisted and the respective file extent
		* items are in the fs/subvol btree.
		* We have to do this here to avoid the priority inversion of waiting on
		* IO of a lower priority task while holding a transaciton open.
		*/
		ret = btrfs_wait_ordered_range(inode, start, len);
		} else {
		/*
		* Start any new ordered operations before starting to log the
		* inode. We will wait for them to finish in btrfs_sync_log().
		*
		* Right before acquiring the inode's mutex, we might have new
		* writes dirtying pages, which won't immediately start the
		* respective ordered operations - that is done through the
		* fill_delalloc callbacks invoked from the writepage and
		* writepages address space operations. So make sure we start
		* all ordered operations before starting to log our inode. Not
		* doing this means that while logging the inode, writeback
		* could start and invoke writepage/writepages, which would call
		* the fill_delalloc callbacks (cow_file_range,
		* submit_compressed_extents). These callbacks add first an
		* extent map to the modified list of extents and then create
		* the respective ordered operation, which means in
		* tree-log.c:btrfs_log_inode() we might capture all existing
		* ordered operations (with btrfs_get_logged_extents()) before
		* the fill_delalloc callback adds its ordered operation, and by
		* the time we visit the modified list of extent maps (with
		* btrfs_log_changed_extents()), we see and process the extent
		* map they created. We then use the extent map to construct a
		* file extent item for logging without waiting for the
		* respective ordered operation to finish - this file extent
		* item points to a disk location that might not have yet been
		* written to, containing random data - so after a crash a log
		* replay will make our inode have file extent items that point
		* to disk locations containing invalid data, as we returned
		* success to userspace without waiting for the respective
		* ordered operation to finish, because it wasn't captured by
		* btrfs_get_logged_extents().
		*/
		ret = start_ordered_ops(inode, start, end);
		}
		if (ret) {
		inode_unlock(inode);
		goto out;
		@@ -2239,13 +2198,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
		goto out;
		}
		}
		if (!full_sync) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		if (ret) {
		btrfs_end_transaction(trans);
		goto out;
		}
		}
		ret = btrfs_commit_transaction(trans);
		} else {
		ret = btrfs_end_transaction(trans);