Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b436b9be authored by Jan Kara's avatar Jan Kara Committed by Theodore Ts'o
Browse files

ext4: Wait for proper transaction commit on fsync



We cannot rely on buffer dirty bits during fsync because pdflush can come
before fsync is called and clear dirty bits without forcing a transaction
commit. What we do is that we track which transaction has last changed
the inode and which transaction last changed allocation and force it to
disk on fsync.

Signed-off-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent 194074ac
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -709,6 +709,13 @@ struct ext4_inode_info {
	struct list_head i_aio_dio_complete_list;
	/* current io_end structure for async DIO write*/
	ext4_io_end_t *cur_aio_dio;

	/*
	 * Transactions that contain inode's metadata needed to complete
	 * fsync and fdatasync, respectively.
	 */
	tid_t i_sync_tid;
	tid_t i_datasync_tid;
};

/*
+13 −0
Original line number Diff line number Diff line
@@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
	return 0;
}

static inline void ext4_update_inode_fsync_trans(handle_t *handle,
						 struct inode *inode,
						 int datasync)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_handle_valid(handle)) {
		ei->i_sync_tid = handle->h_transaction->t_tid;
		if (datasync)
			ei->i_datasync_tid = handle->h_transaction->t_tid;
	}
}

/* super.c */
int ext4_force_commit(struct super_block *sb);

+12 −2
Original line number Diff line number Diff line
@@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
		ret = ext4_convert_unwritten_extents_dio(handle, inode,
							path);
		if (ret >= 0)
			ext4_update_inode_fsync_trans(handle, inode, 1);
		goto out2;
	}
	/* buffered IO case */
@@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
	ret = ext4_ext_convert_to_initialized(handle, inode,
						path, iblock,
						max_blocks);
	if (ret >= 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);
out:
	if (ret <= 0) {
		err = ret;
@@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
	allocated = ext4_ext_get_actual_len(&newex);
	set_buffer_new(bh_result);

	/* Cache only when it is _not_ an uninitialized extent */
	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
	/*
	 * Cache the extent and update transaction to commit on fdatasync only
	 * when it is _not_ an uninitialized extent.
	 */
	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
						EXT4_EXT_CACHE_EXTENT);
		ext4_update_inode_fsync_trans(handle, inode, 1);
	} else
		ext4_update_inode_fsync_trans(handle, inode, 0);
out:
	if (allocated > max_blocks)
		allocated = max_blocks;
+17 −29
Original line number Diff line number Diff line
@@ -51,25 +51,30 @@
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	struct ext4_inode_info *ei = EXT4_I(inode);
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
	int err, ret = 0;
	int ret;
	tid_t commit_tid;

	J_ASSERT(ext4_journal_current_handle() == NULL);

	trace_ext4_sync_file(file, dentry, datasync);

	if (inode->i_sb->s_flags & MS_RDONLY)
		return 0;

	ret = flush_aio_dio_completed_IO(inode);
	if (ret < 0)
		return ret;
	
	if (!journal)
		return simple_fsync(file, dentry, datasync);

	/*
	 * data=writeback:
	 * data=writeback,ordered:
	 *  The caller's filemap_fdatawrite()/wait will sync the data.
	 *  sync_inode() will sync the metadata
	 *
	 * data=ordered:
	 *  The caller's filemap_fdatawrite() will write the data and
	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
	 *  filemap_fdatawait() will wait on the pages.
	 *  Metadata is in the journal, we wait for proper transaction to
	 *  commit here.
	 *
	 * data=journal:
	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
	if (ext4_should_journal_data(inode))
		return ext4_force_commit(inode->i_sb);

	if (!journal)
		ret = sync_mapping_buffers(inode->i_mapping);

	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		goto out;

	/*
	 * The VFS has written the file data.  If the inode is unaltered
	 * then we need not start a commit.
	 */
	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_ALL,
			.nr_to_write = 0, /* sys_fsync did this */
		};
		err = sync_inode(inode, &wbc);
		if (ret == 0)
			ret = err;
	}
out:
	if (journal && (journal->j_flags & JBD2_BARRIER))
	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
	if (jbd2_log_start_commit(journal, commit_tid))
		jbd2_log_wait_commit(journal, commit_tid);
	else if (journal->j_flags & JBD2_BARRIER)
		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
	return ret;
}
+29 −0
Original line number Diff line number Diff line
@@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
		goto cleanup;

	set_buffer_new(bh_result);

	ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (count > blocks_to_boundary)
@@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
	struct ext4_inode *raw_inode;
	struct ext4_inode_info *ei;
	struct inode *inode;
	journal_t *journal = EXT4_SB(sb)->s_journal;
	long ret;
	int block;

@@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);

	/*
	 * Set transaction id's of transactions that have to be committed
	 * to finish f[data]sync. We set them to currently running transaction
	 * as we cannot be sure that the inode or some of its metadata isn't
	 * part of the transaction - the inode could have been reclaimed and
	 * now it is reread from disk.
	 */
	if (journal) {
		transaction_t *transaction;
		tid_t tid;

		spin_lock(&journal->j_state_lock);
		if (journal->j_running_transaction)
			transaction = journal->j_running_transaction;
		else
			transaction = journal->j_committing_transaction;
		if (transaction)
			tid = transaction->t_tid;
		else
			tid = journal->j_commit_sequence;
		spin_unlock(&journal->j_state_lock);
		ei->i_sync_tid = tid;
		ei->i_datasync_tid = tid;
	}

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
		err = rc;
	ei->i_state &= ~EXT4_STATE_NEW;

	ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
	brelse(bh);
	ext4_std_error(inode->i_sb, err);
Loading