Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1e868d8e authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: xfs_swap_extents needs to handle dynamic fork offsets
  xfs: fix missing error check in xfs_rtfree_range
  xfs: fix stale inode flush avoidance
  xfs: Remove inode iolock held check during allocation
  xfs: reclaim all inodes by background tree walks
  xfs: Avoid inodes in reclaim when flushing from inode cache
  xfs: reclaim inodes under a write lock
parents 2faae422 e09f9860
Loading
Loading
Loading
Loading
+6 −8
Original line number Diff line number Diff line
@@ -954,16 +954,14 @@ xfs_fs_destroy_inode(
	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));

	/*
	 * If we have nothing to flush with this inode then complete the
	 * teardown now, otherwise delay the flush operation.
	 * We always use background reclaim here because even if the
	 * inode is clean, it still may be under IO and hence we have
	 * to take the flush lock. The background reclaim path handles
	 * this more efficiently than we can here, so simply let background
	 * reclaim tear down all inodes.
	 */
	if (!xfs_inode_clean(ip)) {
		xfs_inode_set_reclaim_tag(ip);
		return;
	}

out_reclaim:
	xfs_ireclaim(ip);
	xfs_inode_set_reclaim_tag(ip);
}

/*
+86 −97
Original line number Diff line number Diff line
@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
	 * as the tree is sparse and a gang lookup walks to find
	 * the number of objects requested.
	 */
	read_lock(&pag->pag_ici_lock);
	if (tag == XFS_ICI_NO_TAG) {
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)&ip, *first_index, 1);
@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
				(void **)&ip, *first_index, 1, tag);
	}
	if (!nr_found)
		goto unlock;
		return NULL;

	/*
	 * Update the index for the next lookup. Catch overflows
@@ -84,13 +83,8 @@ xfs_inode_ag_lookup(
	 */
	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
		goto unlock;

	return ip;

unlock:
	read_unlock(&pag->pag_ici_lock);
		return NULL;
	return ip;
}

STATIC int
@@ -100,7 +94,8 @@ xfs_inode_ag_walk(
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags,
	int			tag)
	int			tag,
	int			exclusive)
{
	struct xfs_perag	*pag = &mp->m_perag[ag];
	uint32_t		first_index;
@@ -114,10 +109,20 @@ xfs_inode_ag_walk(
		int		error = 0;
		xfs_inode_t	*ip;

		if (exclusive)
			write_lock(&pag->pag_ici_lock);
		else
			read_lock(&pag->pag_ici_lock);
		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
		if (!ip)
		if (!ip) {
			if (exclusive)
				write_unlock(&pag->pag_ici_lock);
			else
				read_unlock(&pag->pag_ici_lock);
			break;
		}

		/* execute releases pag->pag_ici_lock */
		error = execute(ip, pag, flags);
		if (error == EAGAIN) {
			skipped++;
@@ -125,9 +130,8 @@ xfs_inode_ag_walk(
		}
		if (error)
			last_error = error;
		/*
		 * bail out if the filesystem is corrupted.
		 */

		/* bail out if the filesystem is corrupted.  */
		if (error == EFSCORRUPTED)
			break;

@@ -148,7 +152,8 @@ xfs_inode_ag_iterator(
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags,
	int			tag)
	int			tag,
	int			exclusive)
{
	int			error = 0;
	int			last_error = 0;
@@ -157,7 +162,8 @@ xfs_inode_ag_iterator(
	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
		if (!mp->m_perag[ag].pag_ici_init)
			continue;
		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
						exclusive);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
@@ -174,30 +180,31 @@ xfs_sync_inode_valid(
	struct xfs_perag	*pag)
{
	struct inode		*inode = VFS_I(ip);
	int			error = EFSCORRUPTED;

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		read_unlock(&pag->pag_ici_lock);
		return EFSCORRUPTED;
	}
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_unlock;

	/*
	 * If we can't get a reference on the inode, it must be in reclaim.
	 * Leave it for the reclaim code to flush. Also avoid inodes that
	 * haven't been fully initialised.
	 */
	if (!igrab(inode)) {
		read_unlock(&pag->pag_ici_lock);
		return ENOENT;
	}
	read_unlock(&pag->pag_ici_lock);
	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	error = ENOENT;
	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock;

	/* If we can't grab the inode, it must on it's way to reclaim. */
	if (!igrab(inode))
		goto out_unlock;

	if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
	if (is_bad_inode(inode)) {
		IRELE(ip);
		return ENOENT;
		goto out_unlock;
	}

	return 0;
	/* inode is valid */
	error = 0;
out_unlock:
	read_unlock(&pag->pag_ici_lock);
	return error;
}

STATIC int
@@ -282,7 +289,7 @@ xfs_sync_data(
	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);

	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
				      XFS_ICI_NO_TAG);
				      XFS_ICI_NO_TAG, 0);
	if (error)
		return XFS_ERROR(error);

@@ -304,7 +311,7 @@ xfs_sync_attr(
	ASSERT((flags & ~SYNC_WAIT) == 0);

	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
				     XFS_ICI_NO_TAG);
				     XFS_ICI_NO_TAG, 0);
}

STATIC int
@@ -664,60 +671,6 @@ xfs_syncd_stop(
	kthread_stop(mp->m_sync_task);
}

STATIC int
xfs_reclaim_inode(
	xfs_inode_t	*ip,
	int		sync_mode)
{
	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);

	/* The hash lock here protects a thread in xfs_iget_core from
	 * racing with us on linking the inode back with a vnode.
	 * Once we have the XFS_IRECLAIM flag set it will not touch
	 * us.
	 */
	write_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
		spin_unlock(&ip->i_flags_lock);
		write_unlock(&pag->pag_ici_lock);
		return -EAGAIN;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	write_unlock(&pag->pag_ici_lock);
	xfs_put_perag(ip->i_mount, pag);

	/*
	 * If the inode is still dirty, then flush it out.  If the inode
	 * is not in the AIL, then it will be OK to flush it delwri as
	 * long as xfs_iflush() does not keep any references to the inode.
	 * We leave that decision up to xfs_iflush() since it has the
	 * knowledge of whether it's OK to simply do a delwri flush of
	 * the inode or whether we need to wait until the inode is
	 * pulled from the AIL.
	 * We get the flush lock regardless, though, just to make sure
	 * we don't free it while it is being flushed.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_iflock(ip);

	/*
	 * In the case of a forced shutdown we rely on xfs_iflush() to
	 * wait for the inode to be unpinned before returning an error.
	 */
	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
		/* synchronize with xfs_iflush_done */
		xfs_iflock(ip);
		xfs_ifunlock(ip);
	}

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_ireclaim(ip);
	return 0;
}

void
__xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
@@ -760,19 +713,55 @@ __xfs_inode_clear_reclaim_tag(
}

STATIC int
xfs_reclaim_inode_now(
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags)
	int			sync_mode)
{
	/* ignore if already under reclaim */
	if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
		read_unlock(&pag->pag_ici_lock);
	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 */
	spin_lock(&ip->i_flags_lock);
	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* ignore as it is already under reclaim */
		spin_unlock(&ip->i_flags_lock);
		write_unlock(&pag->pag_ici_lock);
		return 0;
	}
	read_unlock(&pag->pag_ici_lock);
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	write_unlock(&pag->pag_ici_lock);

	/*
	 * If the inode is still dirty, then flush it out.  If the inode
	 * is not in the AIL, then it will be OK to flush it delwri as
	 * long as xfs_iflush() does not keep any references to the inode.
	 * We leave that decision up to xfs_iflush() since it has the
	 * knowledge of whether it's OK to simply do a delwri flush of
	 * the inode or whether we need to wait until the inode is
	 * pulled from the AIL.
	 * We get the flush lock regardless, though, just to make sure
	 * we don't free it while it is being flushed.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_iflock(ip);

	return xfs_reclaim_inode(ip, flags);
	/*
	 * In the case of a forced shutdown we rely on xfs_iflush() to
	 * wait for the inode to be unpinned before returning an error.
	 */
	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
		/* synchronize with xfs_iflush_done */
		xfs_iflock(ip);
		xfs_ifunlock(ip);
	}

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_ireclaim(ip);
	return 0;
}

int
@@ -780,6 +769,6 @@ xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
					XFS_ICI_RECLAIM_TAG);
	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
					XFS_ICI_RECLAIM_TAG, 1);
}
+1 −1
Original line number Diff line number Diff line
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
	int flags, int tag);
	int flags, int tag, int write_lock);

#endif
+1 −1
Original line number Diff line number Diff line
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
	uint		 flags)
{
	ASSERT(mp->m_quotainfo);
	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
}

/*------------------------------------------------------------------------*/
+90 −16
Original line number Diff line number Diff line
@@ -114,10 +114,82 @@ xfs_swapext(
	return error;
}

/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
 * data fork depending on the space the attribute fork is taking so we can get
 * invalid formats on the target inode.
 *
 * E.g. target has space for 7 extents in extent format, temp inode only has
 * space for 6.  If we defragment down to 7 extents, then the tmp format is a
 * btree, but when swapped it needs to be in extent format. Hence we can't just
 * blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up with
 * a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt. basically we are putting the responsibility on
 * userspace to get this right.
 */
static int
xfs_swap_extents_check_format(
	xfs_inode_t	*ip,	/* target inode */
	xfs_inode_t	*tip)	/* tmp inode */
{

	/* Should never get a local format */
	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
		return EINVAL;

	/*
	 * if the target inode has less extents that then temporary inode then
	 * why did userspace call us?
	 */
	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
		return EINVAL;

	/*
	 * if the target inode is in extent form and the temp inode is in btree
	 * form then we will end up with the target inode in the wrong format
	 * as we already know there are less extents in the temp inode.
	 */
	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
		return EINVAL;

	/* Check temp in extent form to max in target */
	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
		return EINVAL;

	/* Check target in extent form to max in temp */
	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
		return EINVAL;

	/* Check root block of temp in btree form to max in target */
	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
	    XFS_IFORK_BOFF(ip) &&
	    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
		return EINVAL;

	/* Check root block of target in btree form to max in temp */
	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
	    XFS_IFORK_BOFF(tip) &&
	    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
		return EINVAL;

	return 0;
}

int
xfs_swap_extents(
	xfs_inode_t	*ip,
	xfs_inode_t	*tip,
	xfs_inode_t	*ip,	/* target inode */
	xfs_inode_t	*tip,	/* tmp inode */
	xfs_swapext_t	*sxp)
{
	xfs_mount_t	*mp;
@@ -161,13 +233,6 @@ xfs_swap_extents(
		goto out_unlock;
	}

	/* Should never get a local format */
	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	if (VN_CACHED(VFS_I(tip)) != 0) {
		error = xfs_flushinval_pages(tip, 0, -1,
				FI_REMAPF_LOCKED);
@@ -189,13 +254,12 @@ xfs_swap_extents(
		goto out_unlock;
	}

	/*
	 * If the target has extended attributes, the tmp file
	 * must also in order to ensure the correct data fork
	 * format.
	 */
	if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
		error = XFS_ERROR(EINVAL);
	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_fs_cmn_err(CE_NOTE, mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__FILE__, ip->i_ino);
		goto out_unlock;
	}

@@ -275,6 +339,16 @@ xfs_swap_extents(
	*ifp = *tifp;		/* struct copy */
	*tifp = *tempifp;	/* struct copy */

	/*
	 * Fix the in-memory data fork values that are dependent on the fork
	 * offset in the inode. We can't assume they remain the same as attr2
	 * has dynamic fork offsets.
	 */
	ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
					(uint)sizeof(xfs_bmbt_rec_t);
	tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
					(uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * Fix the on-disk inode values
	 */
Loading