Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit bd5fe6c5 authored by Christoph Hellwig, committed by Al Viro
Browse files

fs: kill i_alloc_sem



i_alloc_sem is a rather special rw_semaphore.  It's the last one that may
be released by a non-owner, and its write side is always mirrored by
real exclusion.  Its intended use is to wait for all pending direct I/O
requests to finish before starting a truncate.

Replace it with a hand-grown construct:

 - exclusion for truncates is already guaranteed by i_mutex, so it can
   simply fall away
 - the reader side is replaced by an i_dio_count member in struct inode
   that counts the number of pending direct I/O requests.  Truncate can't
   proceed as long as it's non-zero
 - when i_dio_count reaches zero we wake up a pending truncate using
   wake_up_bit on a new bit in i_state
 - new references to i_dio_count can't appear while we are waiting for
   it to read zero because the direct I/O count always needs i_mutex
   (or an equivalent like XFS's i_iolock) for starting a new operation.

This scheme is much simpler, and saves the space of a spinlock_t and a
struct list_head in struct inode (typically 160 bits on a non-debug 64-bit
system).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent f9b5570d
Loading
Loading
Loading
Loading
+1 −4
Original line number Original line Diff line number Diff line
@@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
		return error;
		return error;


	if (ia_valid & ATTR_SIZE)
	if (ia_valid & ATTR_SIZE)
		down_write(&dentry->d_inode->i_alloc_sem);
		inode_dio_wait(inode);


	if (inode->i_op->setattr)
	if (inode->i_op->setattr)
		error = inode->i_op->setattr(dentry, attr);
		error = inode->i_op->setattr(dentry, attr);
	else
	else
		error = simple_setattr(dentry, attr);
		error = simple_setattr(dentry, attr);


	if (ia_valid & ATTR_SIZE)
		up_write(&dentry->d_inode->i_alloc_sem);

	if (!error)
	if (!error)
		fsnotify_change(dentry, ia_valid);
		fsnotify_change(dentry, ia_valid);


+51 −14
Original line number Original line Diff line number Diff line
@@ -135,6 +135,50 @@ struct dio {
	struct page *pages[DIO_PAGES];	/* page buffer */
	struct page *pages[DIO_PAGES];	/* page buffer */
};
};


/*
 * __inode_dio_wait - slow path of inode_dio_wait()
 * @inode: inode whose i_dio_count should drain
 *
 * Sleeps in TASK_UNINTERRUPTIBLE state on the bit waitqueue keyed by
 * (&inode->i_state, __I_DIO_WAKEUP) until i_dio_count reads zero.
 * inode_dio_done() issues wake_up_bit() on the same word/bit pair when
 * its decrement takes the counter to zero.
 */
static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		/*
		 * Get on the waitqueue before sampling the counter;
		 * presumably so that a wake-up issued between the check
		 * and schedule() is not missed.
		 */
		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
		/* Re-check after each pass; keep waiting while requests remain. */
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wait);
}

/**
 * inode_dio_wait - block until no direct I/O is outstanding on an inode
 * @inode: inode whose pending direct I/O requests should drain
 *
 * Used ahead of a truncate (or an equivalent operation) to make sure
 * every in-flight direct I/O request on @inode has completed.
 *
 * The caller must hold a lock that prevents new references to
 * i_dio_count from being taken, usually inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
	/* Fast path: nothing in flight, no need to touch the waitqueue. */
	if (!atomic_read(&inode->i_dio_count))
		return;

	__inode_dio_wait(inode);
}
EXPORT_SYMBOL_GPL(inode_dio_wait);

/**
 * inode_dio_done - signal completion of a direct I/O request
 * @inode: inode the direct I/O was issued against
 *
 * Drops one reference on i_dio_count once a direct I/O request has been
 * fully processed.  When the counter reaches zero, anybody waiting for
 * direct I/O to be quiesced is woken via the __I_DIO_WAKEUP bit of
 * i_state.
 */
void inode_dio_done(struct inode *inode)
{
	/* Other requests are still pending; there is nobody to wake yet. */
	if (!atomic_dec_and_test(&inode->i_dio_count))
		return;

	wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
EXPORT_SYMBOL_GPL(inode_dio_done);

/*
/*
 * How many pages are in the queue?
 * How many pages are in the queue?
 */
 */
@@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
	}
	}


	if (dio->flags & DIO_LOCKING)
	if (dio->flags & DIO_LOCKING)
		/* lockdep: non-owner release */
		inode_dio_done(dio->inode);
		up_read_non_owner(&dio->inode->i_alloc_sem);

	return ret;
	return ret;
}
}


@@ -980,9 +1022,6 @@ static int do_direct_IO(struct dio *dio)
	return ret;
	return ret;
}
}


/*
 * Releases both i_mutex and i_alloc_sem
 */
static ssize_t
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
@@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 *    For writes this function is called under i_mutex and returns with
 *    For writes this function is called under i_mutex and returns with
 *    i_mutex held, for reads, i_mutex is not held on entry, but it is
 *    i_mutex held, for reads, i_mutex is not held on entry, but it is
 *    taken and dropped again before returning.
 *    taken and dropped again before returning.
 *    For reads and writes i_alloc_sem is taken in shared mode and released
 *    The i_dio_count counter keeps track of the number of outstanding
 *    on I/O completion (which may happen asynchronously after returning to
 *    direct I/O requests, and truncate waits for it to reach zero.
 *    the caller).
 *    New references to i_dio_count must only be grabbed with i_mutex
 *    held.
 *
 *
 *  - if the flags value does NOT contain DIO_LOCKING we don't use any
 *  - if the flags value does NOT contain DIO_LOCKING we don't use any
 *    internal locking but rather rely on the filesystem to synchronize
 *    internal locking but rather rely on the filesystem to synchronize
 *    direct I/O reads/writes versus each other and truncate.
 *    direct I/O reads/writes versus each other and truncate.
 *    For reads and writes both i_mutex and i_alloc_sem are not held on
 *    entry and are never taken.
 */
 */
ssize_t
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
		}
		}


		/*
		/*
		 * Will be released at I/O completion, possibly in a
		 * Will be decremented at I/O completion time.
		 * different thread.
		 */
		 */
		down_read_non_owner(&inode->i_alloc_sem);
		atomic_inc(&inode->i_dio_count);
	}
	}


	/*
	/*
+1 −2
Original line number Original line Diff line number Diff line
@@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
	mutex_init(&inode->i_mutex);
	mutex_init(&inode->i_mutex);
	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);


	init_rwsem(&inode->i_alloc_sem);
	atomic_set(&inode->i_dio_count, 0);
	lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);


	mapping->a_ops = &empty_aops;
	mapping->a_ops = &empty_aops;
	mapping->host = inode;
	mapping->host = inode;
+1 −2
Original line number Original line Diff line number Diff line
@@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
	 * fails again.
	 * fails again.
	 */
	 */
	if (unlikely(NInoTruncateFailed(ni))) {
	if (unlikely(NInoTruncateFailed(ni))) {
		down_write(&vi->i_alloc_sem);
		inode_dio_wait(vi);
		err = ntfs_truncate(vi);
		err = ntfs_truncate(vi);
		up_write(&vi->i_alloc_sem);
		if (err || NInoTruncateFailed(ni)) {
		if (err || NInoTruncateFailed(ni)) {
			if (!err)
			if (!err)
				err = -EIO;
				err = -EIO;
+2 −8
Original line number Original line Diff line number Diff line
@@ -2357,12 +2357,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run "
 *
 *
 * Returns 0 on success or -errno on error.
 * Returns 0 on success or -errno on error.
 *
 *
 * Called with ->i_mutex held.  In all but one case ->i_alloc_sem is held for
 * Called with ->i_mutex held.
 * writing.  The only case in the kernel where ->i_alloc_sem is not held is
 * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
 * with the current i_size as the offset.  The analogous place in NTFS is in
 * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
 * without holding ->i_alloc_sem.
 */
 */
int ntfs_truncate(struct inode *vi)
int ntfs_truncate(struct inode *vi)
{
{
@@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
 * We also abort all changes of user, group, and mode as we do not implement
 * We also abort all changes of user, group, and mode as we do not implement
 * the NTFS ACLs yet.
 * the NTFS ACLs yet.
 *
 *
 * Called with ->i_mutex held.  For the ATTR_SIZE (i.e. ->truncate) case, also
 * Called with ->i_mutex held.
 * called with ->i_alloc_sem held for writing.
 */
 */
int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
{
{
Loading