Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d68fc57b authored by Yan, Zheng; committed by Chris Mason
Browse files

Btrfs: Metadata reservation for orphan inodes



reserve metadata space for handling orphan inodes

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 8929ecfa
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -151,6 +151,7 @@ struct btrfs_inode {
	 * of these.
	 */
	unsigned ordered_data_close:1;
	unsigned orphan_meta_reserved:1;
	unsigned dummy_inode:1;

	/*
+14 −2
Original line number Diff line number Diff line
@@ -1069,7 +1069,6 @@ struct btrfs_root {
	int ref_cows;
	int track_dirty;
	int in_radix;
	int clean_orphans;

	u64 defrag_trans_start;
	struct btrfs_key defrag_progress;
@@ -1083,8 +1082,11 @@ struct btrfs_root {

	struct list_head root_list;

	spinlock_t list_lock;
	spinlock_t orphan_lock;
	struct list_head orphan_list;
	struct btrfs_block_rsv *orphan_block_rsv;
	int orphan_item_inserted;
	int orphan_cleanup_state;

	spinlock_t inode_lock;
	/* red-black tree that keeps track of in-memory inodes */
@@ -2080,6 +2082,9 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
				int num_items, int *retries);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
				struct btrfs_root *root);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
				  struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
@@ -2404,6 +2409,13 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending,
				u64 *bytes_to_reserve);
void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
+20 −12
Original line number Diff line number Diff line
@@ -894,7 +894,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
	root->ref_cows = 0;
	root->track_dirty = 0;
	root->in_radix = 0;
	root->clean_orphans = 0;
	root->orphan_item_inserted = 0;
	root->orphan_cleanup_state = 0;

	root->fs_info = fs_info;
	root->objectid = objectid;
@@ -904,12 +905,13 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
	root->in_sysfs = 0;
	root->inode_tree = RB_ROOT;
	root->block_rsv = NULL;
	root->orphan_block_rsv = NULL;

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->orphan_list);
	INIT_LIST_HEAD(&root->root_list);
	spin_lock_init(&root->node_lock);
	spin_lock_init(&root->list_lock);
	spin_lock_init(&root->orphan_lock);
	spin_lock_init(&root->inode_lock);
	spin_lock_init(&root->accounting_lock);
	mutex_init(&root->objectid_mutex);
@@ -1193,19 +1195,23 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
	if (root)
		return root;

	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
	if (ret == 0)
		ret = -ENOENT;
	if (ret < 0)
		return ERR_PTR(ret);

	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
	if (IS_ERR(root))
		return root;

	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
	set_anon_super(&root->anon_super, NULL);

	if (btrfs_root_refs(&root->root_item) == 0) {
		ret = -ENOENT;
		goto fail;
	}

	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
	if (ret < 0)
		goto fail;
	if (ret == 0)
		root->orphan_item_inserted = 1;

	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto fail;
@@ -1214,10 +1220,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
	if (ret == 0) {
	if (ret == 0)
		root->in_radix = 1;
		root->clean_orphans = 1;
	}

	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();
	if (ret) {
@@ -1981,6 +1986,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
	BUG_ON(ret);

	if (!(sb->s_flags & MS_RDONLY)) {
		ret = btrfs_cleanup_fs_roots(fs_info);
		BUG_ON(ret);

		ret = btrfs_recover_relocation(tree_root);
		if (ret < 0) {
			printk(KERN_WARNING
+28 −0
Original line number Diff line number Diff line
@@ -3626,6 +3626,34 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
	trans->bytes_reserved = 0;
}

/*
 * Number of metadata units kept in the orphan block reservation for each
 * orphan inode: one for deleting orphan item, one for updating inode and
 * two for calling btrfs_truncate_inode_items.
 *
 * btrfs_truncate_inode_items is a delete operation, it frees more space
 * than it uses in most cases. So two units of metadata space should be
 * enough for calling it many times. If all of the metadata space is
 * used, we can commit transaction and use space it freed.
 */
#define BTRFS_ORPHAN_RSV_UNITS	4

/*
 * Size in bytes of the per-inode orphan reservation. Reserving and
 * releasing must use the same amount, so both paths share this helper
 * instead of repeating the unit count.
 */
static u64 orphan_rsv_bytes(struct btrfs_root *root)
{
	return calc_trans_metadata_size(root, BTRFS_ORPHAN_RSV_UNITS);
}

/*
 * Reserve metadata space for handling @inode as an orphan.
 *
 * Calls block_rsv_migrate_bytes() with the block reservation that
 * get_block_rsv() returns for @trans as the source and the root's
 * orphan_block_rsv as the destination, for orphan_rsv_bytes() bytes.
 *
 * Returns whatever block_rsv_migrate_bytes() returns.
 */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
				  struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
	u64 num_bytes = orphan_rsv_bytes(root);

	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}

/*
 * Release the per-inode amount that btrfs_orphan_reserve_metadata()
 * placed in the root's orphan_block_rsv for @inode.
 */
void btrfs_orphan_release_metadata(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes = orphan_rsv_bytes(root);

	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}

int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending)
{
+257 −42
Original line number Diff line number Diff line
@@ -1981,33 +1981,197 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
	up_read(&root->fs_info->cleanup_work_sem);
}

/*
 * Calculate the extra metadata reservation needed when snapshotting a
 * subvolume that contains orphan files, and add it to *bytes_to_reserve.
 *
 * Nothing is added when the source root has no orphan block reservation
 * or its orphan list is empty.
 */
void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending,
				u64 *bytes_to_reserve)
{
	struct btrfs_root *src_root = pending->root;
	struct btrfs_block_rsv *orphan_rsv = src_root->orphan_block_rsv;
	u64 available;
	u64 extra;

	if (!orphan_rsv || list_empty(&src_root->orphan_list))
		return;

	/* orphan block reservation for the snapshot */
	extra = orphan_rsv->size;

	/*
	 * after the snapshot is created, COWing tree blocks may use more
	 * space than it frees. So also account for whatever the source
	 * reservation currently lacks relative to its size.
	 */
	available = orphan_rsv->reserved +
		    orphan_rsv->freed[trans->transid & 0x1];
	if (available < orphan_rsv->size)
		extra += orphan_rsv->size - available;

	*bytes_to_reserve += extra;
}

/*
 * Set up orphan block reservations after a snapshot has been created.
 *
 * If the source subvolume's orphan reservation holds less than its size,
 * the difference is migrated into it from pending->block_rsv. A new
 * orphan block reservation is then allocated for the snapshot and filled
 * from pending->block_rsv with the source reservation's size.
 *
 * Does nothing when the source root has no orphan block reservation or
 * its orphan list is empty. Any failure is fatal (BUG_ON).
 */
void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *src_root = pending->root;
	struct btrfs_root *snap_root = pending->snap;
	struct btrfs_block_rsv *src_rsv = src_root->orphan_block_rsv;
	struct btrfs_block_rsv *snap_rsv;
	u64 available;
	int err;

	if (!src_rsv || list_empty(&src_root->orphan_list))
		return;

	/* refill source subvolume's orphan block reservation */
	available = src_rsv->reserved + src_rsv->freed[trans->transid & 0x1];
	if (available < src_rsv->size) {
		err = btrfs_block_rsv_migrate(&pending->block_rsv, src_rsv,
					      src_rsv->size - available);
		BUG_ON(err);
	}

	/* setup orphan block reservation for the snapshot */
	snap_rsv = btrfs_alloc_block_rsv(snap_root);
	BUG_ON(!snap_rsv);

	btrfs_add_durable_block_rsv(src_root->fs_info, snap_rsv);
	snap_root->orphan_block_rsv = snap_rsv;

	/* size is read here, after the refill above, not cached earlier */
	err = btrfs_block_rsv_migrate(&pending->block_rsv, snap_rsv,
				      src_rsv->size);
	BUG_ON(err);

#if 0
	/* insert orphan item for the snapshot */
	WARN_ON(!src_root->orphan_item_inserted);
	err = btrfs_insert_orphan_item(trans, src_root->fs_info->tree_root,
				       snap_root->root_key.objectid);
	BUG_ON(err);
	snap_root->orphan_item_inserted = 1;
#endif
}

/*
 * Values stored in a root's orphan_cleanup_state field. The field is
 * initialized to 0 when the root is set up, which means orphan cleanup
 * has not been started for that root.
 */
enum btrfs_orphan_cleanup_state {
	/* set by btrfs_orphan_cleanup (cmpxchg from 0) when it begins */
	ORPHAN_CLEANUP_STARTED	= 1,
	/*
	 * set by btrfs_orphan_cleanup once it has walked the orphans;
	 * btrfs_orphan_commit_root only acts on roots in this state
	 */
	ORPHAN_CLEANUP_DONE	= 2,
};

/*
 * This is called in transaction commmit time. If there are no orphan
 * files in the subvolume, it removes orphan item and frees block_rsv
 * structure.
 */
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	int ret;

	if (!list_empty(&root->orphan_list) ||
	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
		return;

	if (root->orphan_item_inserted &&
	    btrfs_root_refs(&root->root_item) > 0) {
		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
					    root->root_key.objectid);
		BUG_ON(ret);
		root->orphan_item_inserted = 0;
	}

	if (root->orphan_block_rsv) {
		WARN_ON(root->orphan_block_rsv->size > 0);
		btrfs_free_block_rsv(root, root->orphan_block_rsv);
		root->orphan_block_rsv = NULL;
	}
}

/*
 * This creates an orphan entry for the given inode in case something goes
 * wrong in the middle of an unlink/truncate.
 *
 * NOTE: caller of this function should reserve 5 units of metadata for
 *	 this function.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_block_rsv *block_rsv = NULL;
	int reserve = 0;
	int insert = 0;
	int ret;

	spin_lock(&root->list_lock);
	if (!root->orphan_block_rsv) {
		block_rsv = btrfs_alloc_block_rsv(root);
		BUG_ON(!block_rsv);
	}

	/* already on the orphan list, we're good */
	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
		spin_unlock(&root->list_lock);
		return 0;
	spin_lock(&root->orphan_lock);
	if (!root->orphan_block_rsv) {
		root->orphan_block_rsv = block_rsv;
	} else if (block_rsv) {
		btrfs_free_block_rsv(root, block_rsv);
		block_rsv = NULL;
	}

	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);

	spin_unlock(&root->list_lock);

#if 0
		/*
	 * insert an orphan item to track this unlinked/truncated file
		 * For proper ENOSPC handling, we should do orphan
		 * cleanup when mounting. But this introduces backward
		 * compatibility issue.
		 */
		if (!xchg(&root->orphan_item_inserted, 1))
			insert = 2;
		else
			insert = 1;
#endif
		insert = 1;
	} else {
		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
	}

	if (!BTRFS_I(inode)->orphan_meta_reserved) {
		BTRFS_I(inode)->orphan_meta_reserved = 1;
		reserve = 1;
	}
	spin_unlock(&root->orphan_lock);

	if (block_rsv)
		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);

	/* grab metadata reservation from transaction handle */
	if (reserve) {
		ret = btrfs_orphan_reserve_metadata(trans, inode);
		BUG_ON(ret);
	}

	/* insert an orphan item to track this unlinked/truncated file */
	if (insert >= 1) {
		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
		BUG_ON(ret);
	}

	return ret;
	/* insert an orphan item to track subvolume contains orphan files */
	if (insert >= 2) {
		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
					       root->root_key.objectid);
		BUG_ON(ret);
	}
	return 0;
}

/*
@@ -2017,26 +2181,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int delete_item = 0;
	int release_rsv = 0;
	int ret = 0;

	spin_lock(&root->list_lock);

	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
		spin_unlock(&root->list_lock);
		return 0;
	}

	spin_lock(&root->orphan_lock);
	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
		list_del_init(&BTRFS_I(inode)->i_orphan);
	if (!trans) {
		spin_unlock(&root->list_lock);
		return 0;
		delete_item = 1;
	}

	spin_unlock(&root->list_lock);
	if (BTRFS_I(inode)->orphan_meta_reserved) {
		BTRFS_I(inode)->orphan_meta_reserved = 0;
		release_rsv = 1;
	}
	spin_unlock(&root->orphan_lock);

	if (trans && delete_item) {
		ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
		BUG_ON(ret);
	}

	return ret;
	if (release_rsv)
		btrfs_orphan_release_metadata(inode);

	return 0;
}

/*
@@ -2053,7 +2222,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
	struct inode *inode;
	int ret = 0, nr_unlink = 0, nr_truncate = 0;

	if (!xchg(&root->clean_orphans, 0))
	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
		return;

	path = btrfs_alloc_path();
@@ -2106,16 +2275,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
		if (IS_ERR(inode))
			break;
		BUG_ON(IS_ERR(inode));

		/*
		 * add this inode to the orphan list so btrfs_orphan_del does
		 * the proper thing when we hit it
		 */
		spin_lock(&root->list_lock);
		spin_lock(&root->orphan_lock);
		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
		spin_unlock(&root->list_lock);
		spin_unlock(&root->orphan_lock);

		/*
		 * if this is a bad inode, means we actually succeeded in
@@ -2142,13 +2310,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
		/* this will do delete_inode and everything for us */
		iput(inode);
	}
	btrfs_free_path(path);

	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;

	if (root->orphan_block_rsv)
		btrfs_block_rsv_release(root, root->orphan_block_rsv,
					(u64)-1);

	if (root->orphan_block_rsv || root->orphan_item_inserted) {
		trans = btrfs_join_transaction(root, 1);
		btrfs_end_transaction(trans, root);
	}

	if (nr_unlink)
		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
	if (nr_truncate)
		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);

	btrfs_free_path(path);
}

/*
@@ -3181,6 +3359,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
	if (pending_del_nr) {
		ret = btrfs_del_items(trans, root, path, pending_del_slot,
				      pending_del_nr);
		BUG_ON(ret);
	}
	btrfs_free_path(path);
	return err;
@@ -3386,7 +3565,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
		}
	}

	trans = btrfs_start_transaction(root, 1);
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	btrfs_set_trans_block_group(trans, inode);

	ret = btrfs_orphan_add(trans, inode);
@@ -3406,8 +3588,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
		i_size_write(inode, attr->ia_size);
		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);

		trans = btrfs_start_transaction(root, 1);
		trans = btrfs_start_transaction(root, 0);
		BUG_ON(IS_ERR(trans));
		btrfs_set_trans_block_group(trans, inode);
		trans->block_rsv = root->orphan_block_rsv;
		BUG_ON(!trans->block_rsv);

		ret = btrfs_update_inode(trans, root, inode);
		BUG_ON(ret);
@@ -3487,10 +3672,21 @@ void btrfs_delete_inode(struct inode *inode)
	btrfs_i_size_write(inode, 0);

	while (1) {
		trans = btrfs_start_transaction(root, 1);
		trans = btrfs_start_transaction(root, 0);
		BUG_ON(IS_ERR(trans));
		btrfs_set_trans_block_group(trans, inode);
		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
		trans->block_rsv = root->orphan_block_rsv;

		ret = btrfs_block_rsv_check(trans, root,
					    root->orphan_block_rsv, 0, 5);
		if (ret) {
			BUG_ON(ret != -EAGAIN);
			ret = btrfs_commit_transaction(trans, root);
			BUG_ON(ret);
			continue;
		}

		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
		if (ret != -EAGAIN)
			break;

@@ -3498,6 +3694,7 @@ void btrfs_delete_inode(struct inode *inode)
		btrfs_end_transaction(trans, root);
		trans = NULL;
		btrfs_btree_balance_dirty(root, nr);

	}

	if (ret == 0) {
@@ -5247,8 +5444,10 @@ static void btrfs_truncate(struct inode *inode)
	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);

	trans = btrfs_start_transaction(root, 1);
	trans = btrfs_start_transaction(root, 0);
	BUG_ON(IS_ERR(trans));
	btrfs_set_trans_block_group(trans, inode);
	trans->block_rsv = root->orphan_block_rsv;

	/*
	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5271,6 +5470,23 @@ static void btrfs_truncate(struct inode *inode)
		btrfs_add_ordered_operation(trans, root, inode);

	while (1) {
		if (!trans) {
			trans = btrfs_start_transaction(root, 0);
			BUG_ON(IS_ERR(trans));
			btrfs_set_trans_block_group(trans, inode);
			trans->block_rsv = root->orphan_block_rsv;
		}

		ret = btrfs_block_rsv_check(trans, root,
					    root->orphan_block_rsv, 0, 5);
		if (ret) {
			BUG_ON(ret != -EAGAIN);
			ret = btrfs_commit_transaction(trans, root);
			BUG_ON(ret);
			trans = NULL;
			continue;
		}

		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
@@ -5282,10 +5498,8 @@ static void btrfs_truncate(struct inode *inode)

		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		trans = NULL;
		btrfs_btree_balance_dirty(root, nr);

		trans = btrfs_start_transaction(root, 1);
		btrfs_set_trans_block_group(trans, inode);
	}

	if (ret == 0 && inode->i_nlink > 0) {
@@ -5371,6 +5585,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
	ei->reserved_extents = 0;

	ei->ordered_data_close = 0;
	ei->orphan_meta_reserved = 0;
	ei->dummy_inode = 0;
	ei->force_compress = 0;

@@ -5417,13 +5632,13 @@ void btrfs_destroy_inode(struct inode *inode)
		spin_unlock(&root->fs_info->ordered_extent_lock);
	}

	spin_lock(&root->list_lock);
	spin_lock(&root->orphan_lock);
	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
		       inode->i_ino);
		list_del_init(&BTRFS_I(inode)->i_orphan);
	}
	spin_unlock(&root->list_lock);
	spin_unlock(&root->orphan_lock);

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
Loading