Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d210a28c authored by Yingping Lu, committed by Nathan Scott
Browse files

[XFS] In actual allocation of file system blocks and freeing of extents, the
transaction within each such operation may lock more than one AGF
buffer. Although the extent-freeing function sorts the extents by
AGF number before entering the transaction, when file system
space is very limited the allocator will try every AGF to get
space allocated. This can cause out-of-order locking, and thus
a deadlock. This fix mitigates the scarcity of space for allocation
by setting aside a few blocks that are excluded from reservation, and
avoids the deadlock by maintaining ascending order of AGF locking.

SGI-PV: 947395
SGI-Modid: xfs-linux-melb:xfs-kern:210801a

Signed-off-by: Yingping Lu <yingping@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
parent d3446eac
Loading
Loading
Loading
Loading
+23 −6
Original line number Original line Diff line number Diff line
@@ -1862,7 +1862,7 @@ xfs_alloc_fix_freelist(
		(pag->pagf_longest - delta) :
		(pag->pagf_longest - delta) :
		(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
		(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
	    (args->minleft &&
	    (!(flags & XFS_ALLOC_FLAG_FREEING) &&
	     (int)(pag->pagf_freeblks + pag->pagf_flcount -
	     (int)(pag->pagf_freeblks + pag->pagf_flcount -
		   need - args->total) <
		   need - args->total) <
	     (int)args->minleft)) {
	     (int)args->minleft)) {
@@ -1898,7 +1898,7 @@ xfs_alloc_fix_freelist(
	longest = (longest > delta) ? (longest - delta) :
	longest = (longest > delta) ? (longest - delta) :
		(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
		(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
	     (args->minleft &&
	     (!(flags & XFS_ALLOC_FLAG_FREEING) &&
		(int)(be32_to_cpu(agf->agf_freeblks) +
		(int)(be32_to_cpu(agf->agf_freeblks) +
		   be32_to_cpu(agf->agf_flcount) - need - args->total) <
		   be32_to_cpu(agf->agf_flcount) - need - args->total) <
	     (int)args->minleft)) {
	     (int)args->minleft)) {
@@ -1951,8 +1951,14 @@ xfs_alloc_fix_freelist(
		 * the restrictions correctly.  Can happen for free calls
		 * the restrictions correctly.  Can happen for free calls
		 * on a completely full ag.
		 * on a completely full ag.
		 */
		 */
		if (targs.agbno == NULLAGBLOCK)
		if (targs.agbno == NULLAGBLOCK) {
			if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
				xfs_trans_brelse(tp, agflbp);
				args->agbp = NULL;
				return 0;
			}
			break;
			break;
		}
		/*
		/*
		 * Put each allocated block on the list.
		 * Put each allocated block on the list.
		 */
		 */
@@ -2360,8 +2366,19 @@ xfs_alloc_vextent(
			if (args->agno == sagno &&
			if (args->agno == sagno &&
			    type == XFS_ALLOCTYPE_START_BNO)
			    type == XFS_ALLOCTYPE_START_BNO)
				args->type = XFS_ALLOCTYPE_THIS_AG;
				args->type = XFS_ALLOCTYPE_THIS_AG;
			if (++(args->agno) == mp->m_sb.sb_agcount)
			/*
			* For the first allocation, we can try any AG to get
			* space.  However, if we already have allocated a
			* block, we don't want to try AGs whose number is below
			* sagno. Otherwise, we may end up with out-of-order
			* locking of AGF, which might cause deadlock.
			*/
			if (++(args->agno) == mp->m_sb.sb_agcount) {
				if (args->firstblock != NULLFSBLOCK)
					args->agno = sagno;
				else
					args->agno = 0;
					args->agno = 0;
			}
			/*
			/*
			 * Reached the starting a.g., must either be done
			 * Reached the starting a.g., must either be done
			 * or switch to non-trylock mode.
			 * or switch to non-trylock mode.
@@ -2443,7 +2460,7 @@ xfs_free_extent(
	args.minlen = args.minleft = args.minalignslop = 0;
	args.minlen = args.minleft = args.minalignslop = 0;
	down_read(&args.mp->m_peraglock);
	down_read(&args.mp->m_peraglock);
	args.pag = &args.mp->m_perag[args.agno];
	args.pag = &args.mp->m_perag[args.agno];
	if ((error = xfs_alloc_fix_freelist(&args, 0)))
	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
		goto error0;
		goto error0;
#ifdef DEBUG
#ifdef DEBUG
	ASSERT(args.agbp != NULL);
	ASSERT(args.agbp != NULL);
+2 −0
Original line number Original line Diff line number Diff line
@@ -41,6 +41,7 @@ typedef enum xfs_alloctype
 * Flags for xfs_alloc_fix_freelist.
 * Flags for xfs_alloc_fix_freelist.
 */
 */
#define	XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
#define	XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
#define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/


/*
/*
 * Argument structure for xfs_alloc routines.
 * Argument structure for xfs_alloc routines.
@@ -70,6 +71,7 @@ typedef struct xfs_alloc_arg {
	char		wasfromfl;	/* set if allocation is from freelist */
	char		wasfromfl;	/* set if allocation is from freelist */
	char		isfl;		/* set if is freelist blocks - !acctg */
	char		isfl;		/* set if is freelist blocks - !acctg */
	char		userdata;	/* set if this is user data */
	char		userdata;	/* set if this is user data */
	xfs_fsblock_t	firstblock;	/* io first block allocated */
} xfs_alloc_arg_t;
} xfs_alloc_arg_t;


/*
/*
+4 −1
Original line number Original line Diff line number Diff line
@@ -2762,6 +2762,7 @@ xfs_bmap_btalloc(
	args.mp = mp;
	args.mp = mp;
	args.fsbno = ap->rval;
	args.fsbno = ap->rval;
	args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
	args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
	args.firstblock = ap->firstblock;
	blen = 0;
	blen = 0;
	if (nullfb) {
	if (nullfb) {
		args.type = XFS_ALLOCTYPE_START_BNO;
		args.type = XFS_ALLOCTYPE_START_BNO;
@@ -2821,7 +2822,7 @@ xfs_bmap_btalloc(
		else
		else
			args.minlen = ap->alen;
			args.minlen = ap->alen;
	} else if (ap->low) {
	} else if (ap->low) {
		args.type = XFS_ALLOCTYPE_FIRST_AG;
		args.type = XFS_ALLOCTYPE_START_BNO;
		args.total = args.minlen = ap->minlen;
		args.total = args.minlen = ap->minlen;
	} else {
	} else {
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -3452,6 +3453,7 @@ xfs_bmap_extents_to_btree(
	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
	args.tp = tp;
	args.tp = tp;
	args.mp = mp;
	args.mp = mp;
	args.firstblock = *firstblock;
	if (*firstblock == NULLFSBLOCK) {
	if (*firstblock == NULLFSBLOCK) {
		args.type = XFS_ALLOCTYPE_START_BNO;
		args.type = XFS_ALLOCTYPE_START_BNO;
		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
@@ -3587,6 +3589,7 @@ xfs_bmap_local_to_extents(


		args.tp = tp;
		args.tp = tp;
		args.mp = ip->i_mount;
		args.mp = ip->i_mount;
		args.firstblock = *firstblock;
		ASSERT((ifp->if_flags &
		ASSERT((ifp->if_flags &
			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
		/*
		/*
+4 −6
Original line number Original line Diff line number Diff line
@@ -1569,12 +1569,11 @@ xfs_bmbt_split(
	lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
	lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
	args.fsbno = cur->bc_private.b.firstblock;
	args.fsbno = cur->bc_private.b.firstblock;
	args.firstblock = args.fsbno;
	if (args.fsbno == NULLFSBLOCK) {
	if (args.fsbno == NULLFSBLOCK) {
		args.fsbno = lbno;
		args.fsbno = lbno;
		args.type = XFS_ALLOCTYPE_START_BNO;
		args.type = XFS_ALLOCTYPE_START_BNO;
	} else if (cur->bc_private.b.flist->xbf_low)
	} else
		args.type = XFS_ALLOCTYPE_FIRST_AG;
	else
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
	args.mod = args.minleft = args.alignment = args.total = args.isfl =
	args.mod = args.minleft = args.alignment = args.total = args.isfl =
		args.userdata = args.minalignslop = 0;
		args.userdata = args.minalignslop = 0;
@@ -2356,6 +2355,7 @@ xfs_bmbt_newroot(
		args.userdata = args.minalignslop = 0;
		args.userdata = args.minalignslop = 0;
	args.minlen = args.maxlen = args.prod = 1;
	args.minlen = args.maxlen = args.prod = 1;
	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
	args.firstblock = args.fsbno;
	if (args.fsbno == NULLFSBLOCK) {
	if (args.fsbno == NULLFSBLOCK) {
#ifdef DEBUG
#ifdef DEBUG
		if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
		if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
@@ -2365,9 +2365,7 @@ xfs_bmbt_newroot(
#endif
#endif
		args.fsbno = INT_GET(*pp, ARCH_CONVERT);
		args.fsbno = INT_GET(*pp, ARCH_CONVERT);
		args.type = XFS_ALLOCTYPE_START_BNO;
		args.type = XFS_ALLOCTYPE_START_BNO;
	} else if (args.wasdel)
	} else
		args.type = XFS_ALLOCTYPE_FIRST_AG;
	else
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
	if ((error = xfs_alloc_vextent(&args))) {
	if ((error = xfs_alloc_vextent(&args))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+22 −2
Original line number Original line Diff line number Diff line
@@ -1254,6 +1254,26 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)


	xfs_trans_log_buf(tp, bp, first, last);
	xfs_trans_log_buf(tp, bp, first, last);
}
}

/*
 * Number of free blocks that are withheld from reservation.
 *
 * In order to avoid an ENOSPC-related deadlock caused by
 * out-of-order locking of AGF buffers (PV 947395), we place
 * constraints on the relationship among actual allocations for
 * data blocks, freelist blocks, and potential file data bmap
 * btree blocks. However, these restrictions may result in no
 * actual space being allocated for a delayed extent. For example,
 * a data block may be allocated in a certain AG while no block is
 * left for the additional bmap btree block required by a split of
 * the file's bmap btree. This may lead to an infinite loop in
 * xfssyncd when the file gets flushed to disk and all delayed
 * extents need to be actually allocated.
 *
 * To get around this, we explicitly set aside a few blocks which
 * will not be reserved in delayed allocation. The minimum number of
 * needed freelist blocks is 4 fsbs, and a potential split of a file's
 * bmap btree requires 1 fsb, so we set the number of set-aside
 * blocks to 8.
 */
#define SET_ASIDE_BLOCKS 8

/*
/*
 * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
 * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
 * a delta to a specified field in the in-core superblock.  Simply
 * a delta to a specified field in the in-core superblock.  Simply
@@ -1298,7 +1318,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
		return 0;
		return 0;
	case XFS_SBS_FDBLOCKS:
	case XFS_SBS_FDBLOCKS:


		lcounter = (long long)mp->m_sb.sb_fdblocks;
		lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);


		if (delta > 0) {		/* Putting blocks back */
		if (delta > 0) {		/* Putting blocks back */
@@ -1332,7 +1352,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
			}
			}
		}
		}


		mp->m_sb.sb_fdblocks = lcounter;
		mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
		return 0;
		return 0;
	case XFS_SBS_FREXTENTS:
	case XFS_SBS_FREXTENTS:
		lcounter = (long long)mp->m_sb.sb_frextents;
		lcounter = (long long)mp->m_sb.sb_frextents;