Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fe0f07d0 authored by Jens Axboe, committed by Al Viro
Browse files

direct-io: only inc/dec inode->i_dio_count for file systems

do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.

For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:

clat percentiles (usec):
 |  1.00th=[   33],  5.00th=[   34], 10.00th=[   34], 20.00th=[   34],
 | 30.00th=[   34], 40.00th=[   34], 50.00th=[   35], 60.00th=[   35],
 | 70.00th=[   35], 80.00th=[   35], 90.00th=[   37], 95.00th=[   80],
 | 99.00th=[   98], 99.50th=[  151], 99.90th=[  155], 99.95th=[  155],
 | 99.99th=[  165]

After:

clat percentiles (usec):
 |  1.00th=[   95],  5.00th=[  108], 10.00th=[  129], 20.00th=[  149],
 | 30.00th=[  155], 40.00th=[  161], 50.00th=[  167], 60.00th=[  171],
 | 70.00th=[  177], 80.00th=[  185], 90.00th=[  201], 95.00th=[  270],
 | 99.00th=[  390], 99.50th=[  398], 99.90th=[  418], 99.95th=[  422],
 | 99.99th=[  438]

In other setups, Robert Elliott reported seeing good performance
improvements:

https://lkml.org/lkml/2015/4/3/557



The more applications accessing the device, the worse it gets.

Add a new direct-io flag, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent 8e3c5005
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -152,7 +152,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
	struct inode *inode = file->f_mapping->host;

	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
				    blkdev_get_block, NULL, NULL, 0);
				    blkdev_get_block, NULL, NULL,
				    DIO_SKIP_DIO_COUNT);
}

int __sync_blockdev(struct block_device *bdev, int wait)
+3 −3
Original line number Diff line number Diff line
@@ -8129,7 +8129,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
	if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
		return 0;

	atomic_inc(&inode->i_dio_count);
	inode_dio_begin(inode);
	smp_mb__after_atomic();

	/*
@@ -8169,7 +8169,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		current->journal_info = &outstanding_extents;
	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
				     &BTRFS_I(inode)->runtime_flags)) {
		inode_dio_done(inode);
		inode_dio_end(inode);
		flags = DIO_LOCKING | DIO_SKIP_HOLES;
		wakeup = false;
	}
@@ -8188,7 +8188,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
	}
out:
	if (wakeup)
		inode_dio_done(inode);
		inode_dio_end(inode);
	if (relock)
		mutex_lock(&inode->i_mutex);

+2 −2
Original line number Diff line number Diff line
@@ -209,7 +209,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
	}

	/* Protects against truncate */
	atomic_inc(&inode->i_dio_count);
	inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

@@ -219,7 +219,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	inode_dio_done(inode);
	inode_dio_end(inode);
 out:
	return retval;
}
+5 −2
Original line number Diff line number Diff line
@@ -253,7 +253,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
	if (dio->end_io && dio->result)
		dio->end_io(dio->iocb, offset, transferred, dio->private);

	inode_dio_done(dio->inode);
	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(dio->inode);

	if (is_async) {
		if (dio->rw & WRITE) {
			int err;
@@ -1195,7 +1197,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
	/*
	 * Will be decremented at I/O completion time.
	 */
	atomic_inc(&inode->i_dio_count);
	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = 0;
	sdio.blkbits = blkbits;
+3 −3
Original line number Diff line number Diff line
@@ -682,11 +682,11 @@ ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		 * via ext4_inode_block_unlocked_dio(). Check inode's state
		 * while holding extra i_dio_count ref.
		 */
		atomic_inc(&inode->i_dio_count);
		inode_dio_begin(inode);
		smp_mb();
		if (unlikely(ext4_test_inode_state(inode,
						    EXT4_STATE_DIOREAD_LOCK))) {
			inode_dio_done(inode);
			inode_dio_end(inode);
			goto locked;
		}
		if (IS_DAX(inode))
@@ -697,7 +697,7 @@ ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
						   inode->i_sb->s_bdev, iter,
						   offset, ext4_get_block, NULL,
						   NULL, 0);
		inode_dio_done(inode);
		inode_dio_end(inode);
	} else {
locked:
		if (IS_DAX(inode))
Loading