Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 28a535f9 authored by Dmitry Monakhov's avatar Dmitry Monakhov Committed by Theodore Ts'o
Browse files

ext4: completed_io locking cleanup

Current unwritten extent conversion state-machine is very fuzzy.
- For unknown reason it performs conversion under i_mutex. What for?
  My diagnosis:
  We already protect extent tree with i_data_sem, truncate and punch_hole
  should wait for DIO, so the only data we have to protect is end_io->flags
  modification, but only flush_completed_IO and end_io_work modified this
  flags and we can serialize them via i_completed_io_lock.

  Currently all these games with mutex_trylock result in the following deadlock
   truncate:                          kworker:
    ext4_setattr                       ext4_end_io_work
    mutex_lock(i_mutex)
    inode_dio_wait(inode)  ->BLOCK
                             DEADLOCK<- mutex_trylock()
                                        inode_dio_done()
  #TEST_CASE1_BEGIN
  MNT=/mnt_scrach
  unlink $MNT/file
  fallocate -l $((1024*1024*1024)) $MNT/file
  aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
  sleep 2
  truncate -s 0 $MNT/file
  #TEST_CASE1_END

Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286



This patch makes state machine simple and clean:

(1) xxx_end_io schedule final extent conversion simply by calling
    ext4_add_complete_io(), which append it to ei->i_completed_io_list
    NOTE1: because of (2A) work should be queued only if
    ->i_completed_io_list was empty, otherwise the work is scheduled already.

(2) ext4_flush_completed_IO is responsible for handling all pending
    end_io from ei->i_completed_io_list
    Flushing sequence consists of following stages:
    A) LOCKED: Atomically drain completed_io_list to local_list
    B) Perform extents conversion
    C) LOCKED: move converted io's to to_free list for final deletion
       	     This logic depends on context which we was called from.
    D) Final end_io context destruction
    NOTE1: i_mutex is no longer required because end_io->flags modification
    is protected by ei->ext4_complete_io_lock

Full list of changes:
- Move all completion end_io related routines to page-io.c in order to improve
  logic locality
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
- remove EXT4_IO_END_FSYNC
- Improve SMP scalability by removing useless i_mutex which does not
  protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
- Rename ext4_end_io_nolock to end4_end_io and make it static
- Check flush completion status to ext4_ext_punch_hole(). Because it is
  not good idea to punch blocks from corrupted inode.

Changes since V3 (in request to Jan's comments):
  Fall back to active flush_completed_IO() approach in order to prevent
  performance issues with nolocked DIO reads.
Changes since V2:
  Fix use-after-free caused by race truncate vs end_io_work

Signed-off-by: default avatarDmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent 82e54229
Loading
Loading
Loading
Loading
+1 −2
Original line number Diff line number Diff line
@@ -186,7 +186,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR	0x0002
#define EXT4_IO_END_QUEUED	0x0004
#define EXT4_IO_END_DIRECT	0x0008
#define EXT4_IO_END_IN_FSYNC	0x0010

struct ext4_io_page {
	struct page	*p_page;
@@ -2418,11 +2417,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,

/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
			       struct page *page,
+3 −1
Original line number Diff line number Diff line
@@ -4833,7 +4833,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
	}

	/* finish any pending end_io work */
	ext4_flush_completed_IO(inode);
	err = ext4_flush_completed_IO(inode);
	if (err)
		return err;

	credits = ext4_writepage_trans_blocks(inode);
	handle = ext4_journal_start(inode, credits);
+0 −81
Original line number Diff line number Diff line
@@ -34,87 +34,6 @@

#include <trace/events/ext4.h>

static void dump_completed_IO(struct inode * inode)
{
#ifdef	EXT4FS_DEBUG
	struct list_head *cur, *before, *after;
	ext4_io_end_t *io, *io0, *io1;
	unsigned long flags;

	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
		return;
	}

	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
		cur = &io->list;
		before = cur->prev;
		io0 = container_of(before, ext4_io_end_t, list);
		after = cur->next;
		io1 = container_of(after, ext4_io_end_t, list);

		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
			    io, inode->i_ino, io0, io1);
	}
	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
#endif
}

/*
 * This function is called from ext4_sync_file().
 *
 * When IO is completed, the work to convert unwritten extents to
 * written is queued on workqueue but may not get immediately
 * scheduled. When fsync is called, we need to ensure the
 * conversion is complete before fsync returns.
 * The inode keeps track of a list of pending/completed IO that
 * might needs to do the conversion. This function walks through
 * the list and convert the related unwritten extents for completed IO
 * to written.
 * The function return the number of pending IOs on success.
 */
int ext4_flush_completed_IO(struct inode *inode)
{
	ext4_io_end_t *io;
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned long flags;
	int ret = 0;
	int ret2 = 0;

	dump_completed_IO(inode);
	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
	while (!list_empty(&ei->i_completed_io_list)){
		io = list_entry(ei->i_completed_io_list.next,
				ext4_io_end_t, list);
		list_del_init(&io->list);
		io->flag |= EXT4_IO_END_IN_FSYNC;
		/*
		 * Calling ext4_end_io_nolock() to convert completed
		 * IO to written.
		 *
		 * When ext4_sync_file() is called, run_queue() may already
		 * about to flush the work corresponding to this io structure.
		 * It will be upset if it founds the io structure related
		 * to the work-to-be schedule is freed.
		 *
		 * Thus we need to keep the io structure still valid here after
		 * conversion finished. The io structure has a flag to
		 * avoid double converting from both fsync and background work
		 * queue work.
		 */
		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
		ret = ext4_end_io_nolock(io);
		if (ret < 0)
			ret2 = ret;
		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
		io->flag &= ~EXT4_IO_END_IN_FSYNC;
	}
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
	return (ret2 < 0) ? ret2 : 0;
}

/*
 * If we're not journaling and this is a just-created file, we have to
 * sync our parent directory (if it was freshly created) since
+2 −4
Original line number Diff line number Diff line
@@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,

retry:
	if (rw == READ && ext4_should_dioread_nolock(inode)) {
		if (unlikely(!list_empty(&ei->i_completed_io_list))) {
			mutex_lock(&inode->i_mutex);
		if (unlikely(!list_empty(&ei->i_completed_io_list)))
			ext4_flush_completed_IO(inode);
			mutex_unlock(&inode->i_mutex);
		}

		ret = __blockdev_direct_IO(rw, iocb, inode,
				 inode->i_sb->s_bdev, iov,
				 offset, nr_segs,
+2 −23
Original line number Diff line number Diff line
@@ -2881,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
        ext4_io_end_t *io_end = iocb->private;
	struct workqueue_struct *wq;
	unsigned long flags;
	struct ext4_inode_info *ei;

	/* if not async direct IO or dio with 0 bytes write, just return */
	if (!io_end || !size)
@@ -2912,24 +2909,14 @@ out:
		io_end->iocb = iocb;
		io_end->result = ret;
	}
	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;

	/* Add the io_end to per-inode completed aio dio list*/
	ei = EXT4_I(io_end->inode);
	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
	list_add_tail(&io_end->list, &ei->i_completed_io_list);
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

	/* queue the work to convert unwritten extents to written */
	queue_work(wq, &io_end->work);
	ext4_add_complete_io(io_end);
}

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
	ext4_io_end_t *io_end = bh->b_private;
	struct workqueue_struct *wq;
	struct inode *inode;
	unsigned long flags;

	if (!test_clear_buffer_uninit(bh) || !io_end)
		goto out;
@@ -2948,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
	 */
	inode = io_end->inode;
	ext4_set_io_unwritten_flag(inode, io_end);

	/* Add the io_end to per-inode completed io list*/
	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);

	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
	/* queue the work to convert unwritten extents to written */
	queue_work(wq, &io_end->work);
	ext4_add_complete_io(io_end);
out:
	bh->b_private = NULL;
	bh->b_end_io = NULL;
Loading