Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ee1a47ab authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Ben Myers
Browse files

xfs: add support for large btree blocks



Add support for larger btree blocks that contains a CRC32C checksum,
a filesystem uuid and block number for detecting filesystem
consistency and out of place writes.

[dchinner@redhat.com] Also include an owner field to allow reverse
mappings to be implemented for improved repairability and a LSN
field to so that log recovery can easily determine the last
modification that made it to disk for each buffer.

[dchinner@redhat.com] Add buffer log format flags to indicate the
type of buffer to recovery so that we don't have to do blind magic
number tests to determine what the buffer is.

[dchinner@redhat.com] Modified to fit into the verifier structure.

Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarDave Chinner <dchinner@redhat.com>
Reviewed-by: default avatarBen Myers <bpm@sgi.com>
Signed-off-by: default avatarBen Myers <bpm@sgi.com>
parent a2050646
Loading
Loading
Loading
Loading
+73 −32
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"


STATIC struct xfs_btree_cur *
@@ -272,7 +273,7 @@ xfs_allocbt_key_diff(
	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}

static void
static bool
xfs_allocbt_verify(
	struct xfs_buf		*bp)
{
@@ -280,66 +281,103 @@ xfs_allocbt_verify(
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	unsigned int		level;
	int			sblock_ok; /* block passes checks */

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level as the
	 * perag is not fully initialised and hence not attached to the buffer.
	 * In this case, check against the maximum tree depth.
	 * During growfs operations, we can't verify the exact level or owner as
	 * the perag is not fully initialised and hence not attached to the
	 * buffer.  In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been initialised
	 * from the on disk AGF. Again, we can only check against maximum limits
	 * in this case.
	 */
	level = be16_to_cpu(block->bb_level);
	switch (block->bb_magic) {
	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
		if (!xfs_sb_version_hascrc(&mp->m_sb))
			return false;
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
			return false;
		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
			return false;
		if (pag &&
		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
			return false;
		/* fall through */
	case cpu_to_be32(XFS_ABTB_MAGIC):
		if (pag)
			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
		else
			sblock_ok = level < mp->m_ag_maxlevels;
		if (pag && pag->pagf_init) {
			if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
				return false;
		} else if (level >= mp->m_ag_maxlevels)
			return false;
		break;
	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
		if (!xfs_sb_version_hascrc(&mp->m_sb))
			return false;
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
			return false;
		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
			return false;
		if (pag &&
		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
			return false;
		/* fall through */
	case cpu_to_be32(XFS_ABTC_MAGIC):
		if (pag)
			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
		else
			sblock_ok = level < mp->m_ag_maxlevels;
		if (pag && pag->pagf_init) {
			if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
				return false;
		} else if (level >= mp->m_ag_maxlevels)
			return false;
		break;
	default:
		sblock_ok = 0;
		break;
		return false;
	}

	/* numrecs verification */
	sblock_ok = sblock_ok &&
		be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
	if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
		return false;

	/* sibling pointer verification */
	sblock_ok = sblock_ok &&
		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
		block->bb_u.s.bb_leftsib &&
		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
		block->bb_u.s.bb_rightsib;

	if (!sblock_ok) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}
	if (!block->bb_u.s.bb_leftsib ||
	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
		return false;
	if (!block->bb_u.s.bb_rightsib ||
	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
		return false;

	return true;
}

static void
xfs_allocbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_allocbt_verify(bp);
	if (!(xfs_btree_sblock_verify_crc(bp) &&
	      xfs_allocbt_verify(bp))) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
				     bp->b_target->bt_mount, bp->b_addr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}
}

static void
xfs_allocbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_allocbt_verify(bp);
	if (!xfs_allocbt_verify(bp)) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
				     bp->b_target->bt_mount, bp->b_addr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}
	xfs_btree_sblock_calc_crc(bp);

}

const struct xfs_buf_ops xfs_allocbt_buf_ops = {
@@ -444,6 +482,9 @@ xfs_allocbt_init_cursor(
	cur->bc_private.a.agbp = agbp;
	cur->bc_private.a.agno = agno;

	if (xfs_sb_version_hascrc(&mp->m_sb))
		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;

	return cur;
}

+7 −5
Original line number Diff line number Diff line
@@ -32,7 +32,9 @@ struct xfs_mount;
 * simpler; if we have time later, we'll make the optimizations.
 */
#define	XFS_ABTB_MAGIC		0x41425442	/* 'ABTB' for bno tree */
#define	XFS_ABTB_CRC_MAGIC	0x41423342	/* 'AB3B' */
#define	XFS_ABTC_MAGIC		0x41425443	/* 'ABTC' for cnt tree */
#define	XFS_ABTC_CRC_MAGIC	0x41423343	/* 'AB3C' */

/*
 * Data record/key structure
@@ -59,10 +61,10 @@ typedef __be32 xfs_alloc_ptr_t;

/*
 * Btree block header size depends on a superblock flag.
 *
 * (not quite yet, but soon)
 */
#define XFS_ALLOC_BLOCK_LEN(mp)	XFS_BTREE_SBLOCK_LEN
#define XFS_ALLOC_BLOCK_LEN(mp) \
	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
		XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)

/*
 * Record, key, and pointer address macros for btree blocks.
+1 −1
Original line number Diff line number Diff line
@@ -232,7 +232,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
				return 0;
			return dp->i_d.di_forkoff;
		}
		dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
		break;
	}

+31 −16
Original line number Diff line number Diff line
@@ -439,11 +439,15 @@ xfs_bmap_sanity_check(
{
	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);

	if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) ||
	    be16_to_cpu(block->bb_level) != level ||
	if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
	    block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
		return 0;

	if (be16_to_cpu(block->bb_level) != level ||
	    be16_to_cpu(block->bb_numrecs) == 0 ||
	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
		return 0;

	return 1;
}

@@ -1031,6 +1035,7 @@ xfs_bmap_extents_to_btree(
	xfs_extnum_t		nextents;	/* number of file extents */
	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */

	mp = ip->i_mount;
	ifp = XFS_IFORK_PTR(ip, whichfork);
	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);

@@ -1044,16 +1049,18 @@ xfs_bmap_extents_to_btree(
	 * Fill in the root.
	 */
	block = ifp->if_broot;
	block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
	block->bb_level = cpu_to_be16(1);
	block->bb_numrecs = cpu_to_be16(1);
	block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
	block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
	if (xfs_sb_version_hascrc(&mp->m_sb))
		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
	else
		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
				 XFS_BTREE_LONG_PTRS);

	/*
	 * Need a cursor.  Can't allocate until bb_level is filled in.
	 */
	mp = ip->i_mount;
	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
	cur->bc_private.b.firstblock = *firstblock;
	cur->bc_private.b.flist = flist;
@@ -1102,10 +1109,15 @@ xfs_bmap_extents_to_btree(
	 */
	abp->b_ops = &xfs_bmbt_buf_ops;
	ablock = XFS_BUF_TO_BLOCK(abp);
	ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
	ablock->bb_level = 0;
	ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
	ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
	if (xfs_sb_version_hascrc(&mp->m_sb))
		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
	else
		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
				XFS_BTREE_LONG_PTRS);

	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	for (cnt = i = 0; i < nextents; i++) {
@@ -1155,7 +1167,8 @@ xfs_bmap_local_to_extents(
	xfs_extlen_t	total,		/* total blocks needed by transaction */
	int		*logflagsp,	/* inode logging flags */
	int		whichfork,
	void		(*init_fn)(struct xfs_buf *bp,
	void		(*init_fn)(struct xfs_trans *tp,
				   struct xfs_buf *bp,
				   struct xfs_inode *ip,
				   struct xfs_ifork *ifp))
{
@@ -1207,7 +1220,7 @@ xfs_bmap_local_to_extents(
		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);

		/* initialise the block and copy the data */
		init_fn(bp, ip, ifp);
		init_fn(tp, bp, ip, ifp);

		/* account for the change in fork size and log everything */
		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
@@ -1314,16 +1327,19 @@ xfs_bmap_add_attrfork_extents(
 */
STATIC void
xfs_bmap_local_to_extents_init_fn(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	struct xfs_inode	*ip,
	struct xfs_ifork	*ifp)
{
	bp->b_ops = &xfs_bmbt_buf_ops;
	memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
	xfs_trans_buf_set_type(tp, bp, XFS_BLF_BTREE_BUF);
}

STATIC void
xfs_symlink_local_to_remote(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	struct xfs_inode	*ip,
	struct xfs_ifork	*ifp)
@@ -1342,8 +1358,7 @@ xfs_symlink_local_to_remote(
 *
 * XXX (dgc): investigate whether directory conversion can use the generic
 * formatting callout. It should be possible - it's just a very complex
 * formatter. it would also require passing the transaction through to the init
 * function.
 * formatter.
 */
STATIC int					/* error */
xfs_bmap_add_attrfork_local(
+78 −32
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"

/*
 * Determine the extent state.
@@ -59,24 +60,31 @@ xfs_extent_state(
 */
void
xfs_bmdr_to_bmbt(
	struct xfs_mount	*mp,
	struct xfs_inode	*ip,
	xfs_bmdr_block_t	*dblock,
	int			dblocklen,
	struct xfs_btree_block	*rblock,
	int			rblocklen)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			dmxr;
	xfs_bmbt_key_t		*fkp;
	__be64			*fpp;
	xfs_bmbt_key_t		*tkp;
	__be64			*tpp;

	rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
	if (xfs_sb_version_hascrc(&mp->m_sb))
		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
	else
		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
				 XFS_BTREE_LONG_PTRS);

	rblock->bb_level = dblock->bb_level;
	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
	rblock->bb_numrecs = dblock->bb_numrecs;
	rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
	rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
	dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
@@ -424,6 +432,12 @@ xfs_bmbt_to_bmdr(
	xfs_bmbt_key_t		*tkp;
	__be64			*tpp;

	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
		ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
		ASSERT(rblock->bb_u.l.bb_blkno ==
		       cpu_to_be64(XFS_BUF_DADDR_NULL));
	} else
		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
@@ -708,59 +722,89 @@ xfs_bmbt_key_diff(
				      cur->bc_rec.b.br_startoff;
}

static void
static int
xfs_bmbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	unsigned int		level;
	int			lblock_ok; /* block passes checks */

	/* magic number and level verification.
	switch (block->bb_magic) {
	case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
		if (!xfs_sb_version_hascrc(&mp->m_sb))
			return false;
		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
			return false;
		if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
			return false;
		/*
		 * XXX: need a better way of verifying the owner here. Right now
		 * just make sure there has been one set.
		 */
		if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
			return false;
		/* fall through */
	case cpu_to_be32(XFS_BMAP_MAGIC):
		break;
	default:
		return false;
	}

	/*
	 * numrecs and level verification.
	 *
	 * We don't know waht fork we belong to, so just verify that the level
	 * We don't know what fork we belong to, so just verify that the level
	 * is less than the maximum of the two. Later checks will be more
	 * precise.
	 */
	level = be16_to_cpu(block->bb_level);
	lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
		    level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);

	/* numrecs verification */
	lblock_ok = lblock_ok &&
		be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
	if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
		return false;
	if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
		return false;

	/* sibling pointer verification */
	lblock_ok = lblock_ok &&
		block->bb_u.l.bb_leftsib &&
		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
		 XFS_FSB_SANITY_CHECK(mp,
			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
		block->bb_u.l.bb_rightsib &&
		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
		 XFS_FSB_SANITY_CHECK(mp,
			be64_to_cpu(block->bb_u.l.bb_rightsib)));

	if (!lblock_ok) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}
	if (!block->bb_u.l.bb_leftsib ||
	    (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
		return false;
	if (!block->bb_u.l.bb_rightsib ||
	    (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
		return false;

	return true;

}

static void
xfs_bmbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_bmbt_verify(bp);
	if (!(xfs_btree_lblock_verify_crc(bp) &&
	      xfs_bmbt_verify(bp))) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
				     bp->b_target->bt_mount, bp->b_addr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}

}

static void
xfs_bmbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_bmbt_verify(bp);
	if (!xfs_bmbt_verify(bp)) {
		xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
				     bp->b_target->bt_mount, bp->b_addr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
		return;
	}
	xfs_btree_lblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -838,6 +882,8 @@ xfs_bmbt_init_cursor(

	cur->bc_ops = &xfs_bmbt_ops;
	cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
	if (xfs_sb_version_hascrc(&mp->m_sb))
		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;

	cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
	cur->bc_private.b.ip = ip;
Loading