Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1c1c6ebc authored by Dave Chinner's avatar Dave Chinner Committed by Alex Elder
Browse files

xfs: Replace per-ag array with a radix tree



The use of an array for the per-ag structures requires reallocation
of the array when growing the filesystem. This requires locking
access to the array to avoid use after free situations, and the
locking is difficult to get right. To avoid needing to reallocate an
array, change the per-ag structures to an allocated object per ag
and index them using a tree structure.

The AGs are always densely indexed (hence the use of an array), but
the number supported is 2^32 and lookups tend to be random and hence
indexing needs to scale. A simple choice is a radix tree - it works
well with this sort of index.  This change also removes another
large contiguous allocation from the mount/growfs path in XFS.

The growing process now needs to change to only initialise the new
AGs required for the extra space, and as such only needs to
exclusively lock the tree for inserts. The rest of the code only
needs to lock the tree while doing lookups, and hence this will
remove all the deadlocks that currently occur on the m_perag_lock as
it is now an innermost lock. The lock is also changed to a spinlock
from a read/write lock as the hold time is now extremely short.

To complete the picture, the per-ag structures will need to be
reference counted to ensure that we don't free/modify them while
they are still in use.  This will be done in subsequent patch.

Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarAlex Elder <aelder@sgi.com>
parent 44b56e0a
Loading
Loading
Loading
Loading
+0 −8
Original line number Diff line number Diff line
@@ -2276,7 +2276,6 @@ xfs_alloc_vextent(
		 * These three force us into a single a.g.
		 */
		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
		down_read(&mp->m_peraglock);
		args->pag = xfs_perag_get(mp, args->agno);
		args->minleft = 0;
		error = xfs_alloc_fix_freelist(args, 0);
@@ -2286,14 +2285,12 @@ xfs_alloc_vextent(
			goto error0;
		}
		if (!args->agbp) {
			up_read(&mp->m_peraglock);
			trace_xfs_alloc_vextent_noagbp(args);
			break;
		}
		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
		if ((error = xfs_alloc_ag_vextent(args)))
			goto error0;
		up_read(&mp->m_peraglock);
		break;
	case XFS_ALLOCTYPE_START_BNO:
		/*
@@ -2345,7 +2342,6 @@ xfs_alloc_vextent(
		 * Loop over allocation groups twice; first time with
		 * trylock set, second time without.
		 */
		down_read(&mp->m_peraglock);
		for (;;) {
			args->pag = xfs_perag_get(mp, args->agno);
			if (no_min) args->minleft = 0;
@@ -2408,7 +2404,6 @@ xfs_alloc_vextent(
			}
			xfs_perag_put(args->pag);
		}
		up_read(&mp->m_peraglock);
		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
			if (args->agno == sagno)
				mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2438,7 +2433,6 @@ xfs_alloc_vextent(
	return 0;
error0:
	xfs_perag_put(args->pag);
	up_read(&mp->m_peraglock);
	return error;
}

@@ -2463,7 +2457,6 @@ xfs_free_extent(
	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
	down_read(&args.mp->m_peraglock);
	args.pag = xfs_perag_get(args.mp, args.agno);
	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
		goto error0;
@@ -2475,7 +2468,6 @@ xfs_free_extent(
	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
error0:
	xfs_perag_put(args.pag);
	up_read(&args.mp->m_peraglock);
	return error;
}

+1 −6
Original line number Diff line number Diff line
@@ -2629,14 +2629,12 @@ xfs_bmap_btalloc(
		if (startag == NULLAGNUMBER)
			startag = ag = 0;
		notinit = 0;
		down_read(&mp->m_peraglock);
		pag = xfs_perag_get(mp, ag);
		while (blen < ap->alen) {
			if (!pag->pagf_init &&
			    (error = xfs_alloc_pagf_init(mp, args.tp,
				    ag, XFS_ALLOC_FLAG_TRYLOCK))) {
				xfs_perag_put(pag);
				up_read(&mp->m_peraglock);
				return error;
			}
			/*
@@ -2669,10 +2667,8 @@ xfs_bmap_btalloc(

					error = xfs_filestream_new_ag(ap, &ag);
					xfs_perag_put(pag);
					if (error) {
						up_read(&mp->m_peraglock);
					if (error)
						return error;
					}

					/* loop again to set 'blen'*/
					startag = NULLAGNUMBER;
@@ -2688,7 +2684,6 @@ xfs_bmap_btalloc(
			pag = xfs_perag_get(mp, ag);
		}
		xfs_perag_put(pag);
		up_read(&mp->m_peraglock);
		/*
		 * Since the above loop did a BUF_TRYLOCK, it is
		 * possible that there is space for this request.
+4 −9
Original line number Diff line number Diff line
@@ -253,8 +253,7 @@ _xfs_filestream_pick_ag(

/*
 * Set the allocation group number for a file or a directory, updating inode
 * references and per-AG references as appropriate.  Must be called with the
 * m_peraglock held in read mode.
 * references and per-AG references as appropriate.
 */
static int
_xfs_filestream_update_ag(
@@ -456,10 +455,10 @@ xfs_filestream_unmount(
}

/*
 * If the mount point's m_perag array is going to be reallocated, all
 * If the mount point's m_perag tree is going to be modified, all
 * outstanding cache entries must be flushed to avoid accessing reference count
 * addresses that have been freed.  The call to xfs_filestream_flush() must be
 * made inside the block that holds the m_peraglock in write mode to do the
 * made inside the block that holds the m_perag_lock in write mode to do the
 * reallocation.
 */
void
@@ -531,7 +530,6 @@ xfs_filestream_associate(

	mp = pip->i_mount;
	cache = mp->m_filestream;
	down_read(&mp->m_peraglock);

	/*
	 * We have a problem, Houston.
@@ -548,10 +546,8 @@ xfs_filestream_associate(
	 *
	 * So, if we can't get the iolock without sleeping then just give up
	 */
	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
		up_read(&mp->m_peraglock);
	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
		return 1;
	}

	/* If the parent directory is already in the cache, use its AG. */
	item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -606,7 +602,6 @@ xfs_filestream_associate(

exit:
	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
	up_read(&mp->m_peraglock);
	return -err;
}

+21 −21
Original line number Diff line number Diff line
@@ -167,27 +167,14 @@ xfs_growfs_data_private(
	}
	new = nb - mp->m_sb.sb_dblocks;
	oagcount = mp->m_sb.sb_agcount;
	if (nagcount > oagcount) {
		void *new_perag, *old_perag;

		xfs_filestream_flush(mp);

		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
					KM_MAYFAIL);
		if (!new_perag)
			return XFS_ERROR(ENOMEM);

		down_write(&mp->m_peraglock);
		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
		old_perag = mp->m_perag;
		mp->m_perag = new_perag;

		mp->m_flags |= XFS_MOUNT_32BITINODES;
		nagimax = xfs_initialize_perag(mp, nagcount);
		up_write(&mp->m_peraglock);

		kmem_free(old_perag);
	/* allocate the new per-ag structures */
	if (nagcount > oagcount) {
		error = xfs_initialize_perag(mp, nagcount, &nagimax);
		if (error)
			return error;
	}

	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
	tp->t_flags |= XFS_TRANS_RESERVE;
	if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -196,6 +183,11 @@ xfs_growfs_data_private(
		return error;
	}

	/*
	 * Write new AG headers to disk. Non-transactional, but written
	 * synchronously so they are completed prior to the growfs transaction
	 * being logged.
	 */
	nfree = 0;
	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
		/*
@@ -359,6 +351,12 @@ xfs_growfs_data_private(
			goto error0;
		}
	}

	/*
	 * Update changed superblock fields transactionally. These are not
	 * seen by the rest of the world until the transaction commit applies
	 * them atomically to the superblock.
	 */
	if (nagcount > oagcount)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
	if (nb > mp->m_sb.sb_dblocks)
@@ -369,9 +367,9 @@ xfs_growfs_data_private(
	if (dpct)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
	error = xfs_trans_commit(tp, 0);
	if (error) {
	if (error)
		return error;
	}

	/* New allocation groups fully initialized, so update mount struct */
	if (nagimax)
		mp->m_maxagi = nagimax;
@@ -381,6 +379,8 @@ xfs_growfs_data_private(
		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
	} else
		mp->m_maxicount = 0;

	/* update secondary superblocks. */
	for (agno = 1; agno < nagcount; agno++) {
		error = xfs_read_buf(mp, mp->m_ddev_targp,
				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+2 −23
Original line number Diff line number Diff line
@@ -383,11 +383,9 @@ xfs_ialloc_ag_alloc(
	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
	be32_add_cpu(&agi->agi_count, newlen);
	be32_add_cpu(&agi->agi_freecount, newlen);
	down_read(&args.mp->m_peraglock);
	pag = xfs_perag_get(args.mp, agno);
	pag->pagi_freecount += newlen;
	xfs_perag_put(pag);
	up_read(&args.mp->m_peraglock);
	agi->agi_newino = cpu_to_be32(newino);

	/*
@@ -489,7 +487,6 @@ xfs_ialloc_ag_select(
	 */
	agno = pagno;
	flags = XFS_ALLOC_FLAG_TRYLOCK;
	down_read(&mp->m_peraglock);
	for (;;) {
		pag = xfs_perag_get(mp, agno);
		if (!pag->pagi_init) {
@@ -531,7 +528,6 @@ xfs_ialloc_ag_select(
					goto nextag;
				}
				xfs_perag_put(pag);
				up_read(&mp->m_peraglock);
				return agbp;
			}
		}
@@ -544,18 +540,14 @@ xfs_ialloc_ag_select(
		 * No point in iterating over the rest, if we're shutting
		 * down.
		 */
		if (XFS_FORCED_SHUTDOWN(mp)) {
			up_read(&mp->m_peraglock);
		if (XFS_FORCED_SHUTDOWN(mp))
			return NULL;
		}
		agno++;
		if (agno >= agcount)
			agno = 0;
		if (agno == pagno) {
			if (flags == 0) {
				up_read(&mp->m_peraglock);
			if (flags == 0)
				return NULL;
			}
			flags = 0;
		}
	}
@@ -777,16 +769,13 @@ xfs_dialloc(
			*inop = NULLFSINO;
			return noroom ? ENOSPC : 0;
		}
		down_read(&mp->m_peraglock);
		pag = xfs_perag_get(mp, tagno);
		if (pag->pagi_inodeok == 0) {
			xfs_perag_put(pag);
			up_read(&mp->m_peraglock);
			goto nextag;
		}
		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
		xfs_perag_put(pag);
		up_read(&mp->m_peraglock);
		if (error)
			goto nextag;
		agi = XFS_BUF_TO_AGI(agbp);
@@ -1015,9 +1004,7 @@ xfs_dialloc(
		goto error0;
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	down_read(&mp->m_peraglock);
	pag->pagi_freecount--;
	up_read(&mp->m_peraglock);

	error = xfs_check_agi_freecount(cur, agi);
	if (error)
@@ -1100,9 +1087,7 @@ xfs_difree(
	/*
	 * Get the allocation group header.
	 */
	down_read(&mp->m_peraglock);
	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
	up_read(&mp->m_peraglock);
	if (error) {
		cmn_err(CE_WARN,
			"xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s.  Returning error.",
@@ -1169,11 +1154,9 @@ xfs_difree(
		be32_add_cpu(&agi->agi_count, -ilen);
		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
		down_read(&mp->m_peraglock);
		pag = xfs_perag_get(mp, agno);
		pag->pagi_freecount -= ilen - 1;
		xfs_perag_put(pag);
		up_read(&mp->m_peraglock);
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));

@@ -1202,11 +1185,9 @@ xfs_difree(
		 */
		be32_add_cpu(&agi->agi_freecount, 1);
		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
		down_read(&mp->m_peraglock);
		pag = xfs_perag_get(mp, agno);
		pag->pagi_freecount++;
		xfs_perag_put(pag);
		up_read(&mp->m_peraglock);
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
	}

@@ -1328,9 +1309,7 @@ xfs_imap(
		xfs_buf_t	*agbp;	/* agi buffer */
		int		i;	/* temp state */

		down_read(&mp->m_peraglock);
		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
		up_read(&mp->m_peraglock);
		if (error) {
			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
					"xfs_ialloc_read_agi() returned "
Loading