Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a022fe09 authored by Christoph Hellwig, committed by Felix Blyakher
Browse files

xfs: fix locking in xfs_iget_cache_hit



The locking in xfs_iget_cache_hit currently has numerous problems:

 - we clear the reclaim tag without holding i_flags_lock, which
   protects modifications to it
 - we call inode_init_always which can sleep with pag_ici_lock
   held (this is oss.sgi.com BZ #819)
 - we acquire and drop i_flags_lock a lot and thus provide no
   consistency between the various flags we set/clear under it

This patch fixes all that with a major revamp of the locking in
the function.  The new version acquires i_flags_lock early and
only drops it once we need to call into inode_init_always or before
calling xfs_ilock.

This patch fixes a bug seen in the wild where we race modifying the
reclaim tag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
parent 79dd43bb
Loading
Loading
Loading
Loading
+11 −2
Original line number Original line Diff line number Diff line
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
	return 0;
	return 0;
}
}


/*
 * Set the XFS_ICI_RECLAIM_TAG on this inode's slot in the per-AG
 * radix tree, marking it as a reclaim candidate for the walkers that
 * scan pag->pag_ici_root.
 *
 * NOTE(review): the visible caller (xfs_inode_set_reclaim_tag) invokes
 * this with pag->pag_ici_lock and ip->i_flags_lock held, so the tag
 * update is atomic with setting XFS_IRECLAIMABLE; presumably all other
 * callers must hold the same locks — confirm against the full tree.
 */
void
__xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
}

/*
/*
 * We set the inode flag atomically with the radix tree tag.
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(


	read_lock(&pag->pag_ici_lock);
	read_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	spin_lock(&ip->i_flags_lock);
	radix_tree_tag_set(&pag->pag_ici_root,
	__xfs_inode_set_reclaim_tag(pag, ip);
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&ip->i_flags_lock);
	read_unlock(&pag->pag_ici_lock);
	read_unlock(&pag->pag_ici_lock);
+1 −0
Original line number Original line Diff line number Diff line
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);


void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
				struct xfs_inode *ip);
				struct xfs_inode *ip);


+58 −55
Original line number Original line Diff line number Diff line
@@ -134,79 +134,81 @@ xfs_iget_cache_hit(
	int			flags,
	int			flags,
	int			lock_flags) __releases(pag->pag_ici_lock)
	int			lock_flags) __releases(pag->pag_ici_lock)
{
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_mount	*mp = ip->i_mount;
	int			error = EAGAIN;
	int			error;

	spin_lock(&ip->i_flags_lock);


	/*
	/*
	 * If INEW is set this inode is being set up
	 * If we are racing with another cache hit that is currently
	 * If IRECLAIM is set this inode is being torn down
	 * instantiating this inode or currently recycling it out of
	 * Pause and try again.
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	 */
	if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		XFS_STATS_INC(xs_ig_frecycle);
		XFS_STATS_INC(xs_ig_frecycle);
		error = EAGAIN;
		goto out_error;
		goto out_error;
	}
	}


	/* If IRECLAIMABLE is set, we've torn down the vfs inode part */
	if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {

	/*
	/*
		 * If lookup is racing with unlink, then we should return an
	 * If lookup is racing with unlink return an error immediately.
		 * error immediately so we don't remove it from the reclaim
		 * list and potentially leak the inode.
	 */
	 */
		if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
		error = ENOENT;
		error = ENOENT;
		goto out_error;
		goto out_error;
	}
	}


	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into usable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");


		/*
		/*
		 * We need to re-initialise the VFS inode as it has been
		 * We need to set XFS_INEW atomically with clearing the
		 * 'freed' by the VFS. Do this here so we can deal with
		 * reclaimable tag so that we do have an indicator of the
		 * errors cleanly, then tag it so it can be set up correctly
		 * inode still being initialized.
		 * later.
		 */
		 */
		if (!inode_init_always(mp->m_super, VFS_I(ip))) {
		ip->i_flags |= XFS_INEW;
			error = ENOMEM;
		ip->i_flags &= ~XFS_IRECLAIMABLE;
			goto out_error;
		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
		}

		spin_unlock(&ip->i_flags_lock);
		read_unlock(&pag->pag_ici_lock);


		error = -inode_init_always(mp->m_super, inode);
		if (error) {
			/*
			/*
		 * We must set the XFS_INEW flag before clearing the
			 * Re-initializing the inode failed, and we are in deep
		 * XFS_IRECLAIMABLE flag so that if a racing lookup does
			 * trouble.  Try to re-add it to the reclaim list.
		 * not find the XFS_IRECLAIMABLE above but has the igrab()
		 * below succeed we can safely check XFS_INEW to detect
		 * that this inode is still being initialised.
			 */
			 */
		xfs_iflags_set(ip, XFS_INEW);
			read_lock(&pag->pag_ici_lock);
		xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
			spin_lock(&ip->i_flags_lock);


		/* clear the radix tree reclaim flag as well. */
			ip->i_flags &= ~XFS_INEW;
		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
			ip->i_flags |= XFS_IRECLAIMABLE;
	} else if (!igrab(VFS_I(ip))) {
			__xfs_inode_set_reclaim_tag(pag, ip);
		/* If the VFS inode is being torn down, pause and try again. */
		XFS_STATS_INC(xs_ig_frecycle);
			goto out_error;
			goto out_error;
	} else if (xfs_iflags_test(ip, XFS_INEW)) {
		/*
		 * We are racing with another cache hit that is
		 * currently recycling this inode out of the XFS_IRECLAIMABLE
		 * state. Wait for the initialisation to complete before
		 * continuing.
		 */
		wait_on_inode(VFS_I(ip));
		}
		}

		inode->i_state = I_LOCK|I_NEW;
	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
	} else {
		error = ENOENT;
		/* If the VFS inode is being torn down, pause and try again. */
		iput(VFS_I(ip));
		if (!igrab(inode)) {
			error = EAGAIN;
			goto out_error;
			goto out_error;
		}
		}


		/* We've got a live one. */
		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		read_unlock(&pag->pag_ici_lock);
		read_unlock(&pag->pag_ici_lock);
	}


	if (lock_flags != 0)
	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);
		xfs_ilock(ip, lock_flags);
@@ -217,6 +219,7 @@ xfs_iget_cache_hit(
	return 0;
	return 0;


out_error:
out_error:
	spin_unlock(&ip->i_flags_lock);
	read_unlock(&pag->pag_ici_lock);
	read_unlock(&pag->pag_ici_lock);
	return error;
	return error;
}
}