
Commit 581bb050 authored by Li Zefan

Btrfs: Cache free inode numbers in memory



Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) as the inode number when we create a file, so inode
numbers are never reclaimed when we delete files. As a result, we will
eventually run out of inode numbers on 32-bit machines if files are
repeatedly created and deleted.
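For context, the pre-patch allocator is essentially a monotonically increasing counter. A minimal user-space sketch (hypothetical names, not the kernel code) shows why deleting files never returns numbers to the pool:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the old per-root state: only the highest
 * objectid ever handed out is remembered. */
struct fake_root {
	uint64_t highest_objectid;
};

/* Old scheme: always hand out highest+1. */
static uint64_t alloc_ino(struct fake_root *root)
{
	return ++root->highest_objectid;
}

int main(void)
{
	struct fake_root root = { .highest_objectid = 255 };

	/* Create and delete the same file repeatedly: deletion never
	 * lowers highest_objectid, so the counter marches toward the
	 * 32-bit ino_t limit on 32-bit machines. */
	for (int i = 0; i < 5; i++)
		printf("got ino %llu\n",
		       (unsigned long long)alloc_ino(&root));
	return 0;
}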

This fixes it, and it works similarly to how we cache free space in block
groups.

We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
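The gap detection itself is simple: as the scan walks inode items in objectid order, any jump between consecutive objectids is a free chunk. A user-space sketch of that bookkeeping over a sorted array (add_free_chunk() stands in for __btrfs_add_free_space() feeding the rb-tree):

#include <stdint.h>
#include <stdio.h>

#define LAST_FREE_OBJECTID ((uint64_t)-256)	/* mirrors BTRFS_LAST_FREE_OBJECTID */

/* Stand-in for __btrfs_add_free_space(): record one free chunk
 * [start, start + len) in the cache. */
static void add_free_chunk(uint64_t start, uint64_t len)
{
	printf("free: %llu +%llu\n",
	       (unsigned long long)start, (unsigned long long)len);
}

/* Same bookkeeping as caching_kthread(): 'last' trails the scan, every
 * jump between consecutive objectids is reported as free, and one final
 * chunk covers everything up to the last usable objectid. */
static void scan_inodes(const uint64_t *used, int n)
{
	uint64_t last = (uint64_t)-1;

	for (int i = 0; i < n; i++) {
		if (last != (uint64_t)-1 && last + 1 != used[i])
			add_free_chunk(last + 1, used[i] - last - 1);
		last = used[i];
	}
	if (last < LAST_FREE_OBJECTID - 1)
		add_free_chunk(last + 1, LAST_FREE_OBJECTID - last - 1);
}

int main(void)
{
	/* Inode items found by the scan, already in objectid order. */
	const uint64_t used[] = { 256, 257, 260, 300 };

	scan_inodes(used, 4);
	return 0;
}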

Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
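Concretely: a number freed in the running transaction is not yet reflected in the commit root the scanner reads, so the scanner could hand the same number out twice. The patch therefore parks freed numbers in a second "pinned" tree and only moves the safe ones over at commit time. A toy sketch of the routing decision (hypothetical struct and names; the real check lives in btrfs_return_ino() below):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical miniature of the cache state. */
struct ino_cache {
	uint64_t cache_progress; /* highest objectid the scan has passed */
	bool     cache_finished;
};

enum dest { DEST_FREE_TREE, DEST_PINNED_TREE };

/* Numbers the scanner has already passed can go straight into the
 * free_ino tree; anything ahead of the cursor is pinned so the scanner
 * cannot add the same number a second time. */
static enum dest route_returned_ino(const struct ino_cache *c, uint64_t ino)
{
	if (c->cache_finished || ino <= c->cache_progress)
		return DEST_FREE_TREE;
	return DEST_PINNED_TREE;
}

int main(void)
{
	struct ino_cache c = { .cache_progress = 1000, .cache_finished = false };

	return route_returned_ino(&c, 500) == DEST_FREE_TREE ? 0 : 1;
}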

The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K of ram
for extents, and bitmaps will be used once we exceed this threshold. The
extents threshold is adjusted at runtime.
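The numbers work out as follows: INIT_THRESHOLD is half of a 32K budget divided by the size of one extent entry, and a single 4K bitmap page covers PAGE_SIZE * 8 = 32768 inode numbers. A user-space sketch of the runtime adjustment, mirroring recalculate_thresholds() below (the 48-byte entry size is an assumption standing in for sizeof(struct btrfs_free_space); 4K pages assumed):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE         4096u
#define INODES_PER_BITMAP (PAGE_SIZE * 8) /* 32768 inos per bitmap page */
#define ENTRY_SIZE        48u /* assumed sizeof(struct btrfs_free_space) */
#define INIT_THRESHOLD    (((1024u * 32u) / 2u) / ENTRY_SIZE) /* 16K of ram */

/* Mirrors recalculate_thresholds(): allow only as many extent entries
 * as would fit in the bitmap pages not yet allocated, so extents never
 * cost more memory than an all-bitmap cache of the same range would. */
static unsigned int extents_thresh(uint64_t max_ino, unsigned int total_bitmaps)
{
	uint64_t max_bitmaps =
		(max_ino + INODES_PER_BITMAP - 1) / INODES_PER_BITMAP;

	if (max_bitmaps <= total_bitmaps)
		return 0;

	return (max_bitmaps - total_bitmaps) * PAGE_SIZE / ENTRY_SIZE;
}

int main(void)
{
	printf("initial threshold: %u extent entries\n", INIT_THRESHOLD);
	printf("1M inos, 0 bitmaps: %u extent entries allowed\n",
	       extents_thresh(1024 * 1024, 0));
	return 0;
}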

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
parent 34d52cb6
fs/btrfs/ctree.h +9 −6
@@ -1102,6 +1102,15 @@ struct btrfs_root {
 	spinlock_t accounting_lock;
 	struct btrfs_block_rsv *block_rsv;
 
+	/* free ino cache stuff */
+	struct mutex fs_commit_mutex;
+	struct btrfs_free_space_ctl *free_ino_ctl;
+	enum btrfs_caching_type cached;
+	spinlock_t cache_lock;
+	wait_queue_head_t cache_wait;
+	struct btrfs_free_space_ctl *free_ino_pinned;
+	u64 cache_progress;
+
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
@@ -2408,12 +2417,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 offset);
 int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
 
-/* inode-map.c */
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *fs_root,
-			     u64 dirid, u64 *objectid);
-int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
-
 /* inode-item.c */
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
fs/btrfs/disk-io.c +18 −0
@@ -41,6 +41,7 @@
 #include "locking.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
+#include "inode-map.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1327,6 +1328,19 @@ again:
 	if (IS_ERR(root))
 		return root;
 
+	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+	if (!root->free_ino_ctl)
+		goto fail;
+	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+					GFP_NOFS);
+	if (!root->free_ino_pinned)
+		goto fail;
+
+	btrfs_init_free_ino_ctl(root);
+	mutex_init(&root->fs_commit_mutex);
+	spin_lock_init(&root->cache_lock);
+	init_waitqueue_head(&root->cache_wait);
+
 	set_anon_super(&root->anon_super, NULL);
 
 	if (btrfs_root_refs(&root->root_item) == 0) {
@@ -2483,6 +2497,8 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 	if (btrfs_root_refs(&root->root_item) == 0)
 		synchronize_srcu(&fs_info->subvol_srcu);
 
+	__btrfs_remove_free_space_cache(root->free_ino_pinned);
+	__btrfs_remove_free_space_cache(root->free_ino_ctl);
 	free_fs_root(root);
 	return 0;
 }
@@ -2496,6 +2512,8 @@ static void free_fs_root(struct btrfs_root *root)
 	}
 	free_extent_buffer(root->node);
 	free_extent_buffer(root->commit_root);
+	kfree(root->free_ino_ctl);
+	kfree(root->free_ino_pinned);
 	kfree(root->name);
 	kfree(root);
 }
fs/btrfs/free-space-cache.c +76 −20
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "disk-io.h"
 #include "extent_io.h"
+#include "inode-map.h"
 
 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
@@ -105,7 +106,7 @@ int create_free_space_inode(struct btrfs_root *root,
 	u64 objectid;
 	int ret;
 
-	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+	ret = btrfs_find_free_objectid(root, &objectid);
 	if (ret < 0)
 		return ret;
 
@@ -1496,10 +1497,9 @@ bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
 	return merged;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 offset, u64 bytes)
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+			   u64 offset, u64 bytes)
 {
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *info;
 	int ret = 0;
 
@@ -1751,11 +1751,29 @@ out:
 	return 0;
 }
 
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
 {
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *info;
 	struct rb_node *node;
 
+	spin_lock(&ctl->tree_lock);
+	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+		info = rb_entry(node, struct btrfs_free_space, offset_index);
+		unlink_free_space(ctl, info);
+		kfree(info->bitmap);
+		kmem_cache_free(btrfs_free_space_cachep, info);
+		if (need_resched()) {
+			spin_unlock(&ctl->tree_lock);
+			cond_resched();
+			spin_lock(&ctl->tree_lock);
+		}
+	}
+	spin_unlock(&ctl->tree_lock);
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_cluster *cluster;
 	struct list_head *head;
 
@@ -1773,21 +1791,9 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 			spin_lock(&ctl->tree_lock);
 		}
 	}
-
-	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
-		info = rb_entry(node, struct btrfs_free_space, offset_index);
-		unlink_free_space(ctl, info);
-		if (info->bitmap)
-			kfree(info->bitmap);
-		kmem_cache_free(btrfs_free_space_cachep, info);
-		if (need_resched()) {
-			spin_unlock(&ctl->tree_lock);
-			cond_resched();
-			spin_lock(&ctl->tree_lock);
-		}
-	}
-
 	spin_unlock(&ctl->tree_lock);
+
+	__btrfs_remove_free_space_cache(ctl);
 }
 
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
@@ -2352,3 +2358,53 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 
 	return ret;
 }
+
+/*
+ * Find the left-most item in the cache tree, and then return the
+ * smallest inode number in the item.
+ *
+ * Note: the returned inode number may not be the smallest one in
+ * the tree, if the left-most item is a bitmap.
+ */
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+{
+	struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
+	struct btrfs_free_space *entry = NULL;
+	u64 ino = 0;
+
+	spin_lock(&ctl->tree_lock);
+
+	if (RB_EMPTY_ROOT(&ctl->free_space_offset))
+		goto out;
+
+	entry = rb_entry(rb_first(&ctl->free_space_offset),
+			 struct btrfs_free_space, offset_index);
+
+	if (!entry->bitmap) {
+		ino = entry->offset;
+
+		unlink_free_space(ctl, entry);
+		entry->offset++;
+		entry->bytes--;
+		if (!entry->bytes)
+			kmem_cache_free(btrfs_free_space_cachep, entry);
+		else
+			link_free_space(ctl, entry);
+	} else {
+		u64 offset = 0;
+		u64 count = 1;
+		int ret;
+
+		ret = search_bitmap(ctl, entry, &offset, &count);
+		BUG_ON(ret);
+
+		ino = offset;
+		bitmap_clear_bits(ctl, entry, offset, 1);
+		if (entry->bytes == 0)
+			free_bitmap(ctl, entry);
+	}
+out:
+	spin_unlock(&ctl->tree_lock);
+
+	return ino;
+}
fs/btrfs/free-space-cache.h +13 −3
@@ -64,15 +64,25 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 			  struct btrfs_trans_handle *trans,
 			  struct btrfs_block_group_cache *block_group,
 			  struct btrfs_path *path);
-void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 bytenr, u64 size);
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+			   u64 bytenr, u64 size);
+static inline int
+btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+		     u64 bytenr, u64 size)
+{
+	return __btrfs_add_free_space(block_group->free_space_ctl,
+				      bytenr, size);
+}
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 bytenr, u64 size);
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
 				     *block_group);
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 			       u64 offset, u64 bytes, u64 empty_size);
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes);
 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
fs/btrfs/inode-map.c +336 −5
@@ -16,11 +16,343 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+
 #include "ctree.h"
 #include "disk-io.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
 #include "transaction.h"
 
-int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
+static int caching_kthread(void *data)
+{
+	struct btrfs_root *root = data;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	u64 last = (u64)-1;
+	int slot;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* Since the commit root is read-only, we can safely skip locking. */
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+	path->reada = 2;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.offset = 0;
+	key.type = BTRFS_INODE_ITEM_KEY;
+again:
+	/* need to make sure the commit_root doesn't disappear */
+	mutex_lock(&root->fs_commit_mutex);
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		smp_mb();
+		if (fs_info->closing > 1)
+			goto out;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+
+			if (need_resched() ||
+			    btrfs_transaction_in_commit(fs_info)) {
+				leaf = path->nodes[0];
+
+				if (btrfs_header_nritems(leaf) == 0) {
+					WARN_ON(1);
+					break;
+				}
+
+				/*
+				 * Save the key so we can advance forward
+				 * in the next search.
+				 */
+				btrfs_item_key_to_cpu(leaf, &key, 0);
+				btrfs_release_path(root, path);
+				root->cache_progress = last;
+				mutex_unlock(&root->fs_commit_mutex);
+				schedule_timeout(1);
+				goto again;
+			} else
+				continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.type != BTRFS_INODE_ITEM_KEY)
+			goto next;
+
+		if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
+			break;
+
+		if (last != (u64)-1 && last + 1 != key.objectid) {
+			__btrfs_add_free_space(ctl, last + 1,
+					       key.objectid - last - 1);
+			wake_up(&root->cache_wait);
+		}
+
+		last = key.objectid;
+next:
+		path->slots[0]++;
+	}
+
+	if (last < BTRFS_LAST_FREE_OBJECTID - 1) {
+		__btrfs_add_free_space(ctl, last + 1,
+				       BTRFS_LAST_FREE_OBJECTID - last - 1);
+	}
+
+	spin_lock(&root->cache_lock);
+	root->cached = BTRFS_CACHE_FINISHED;
+	spin_unlock(&root->cache_lock);
+
+	root->cache_progress = (u64)-1;
+	btrfs_unpin_free_ino(root);
+out:
+	wake_up(&root->cache_wait);
+	mutex_unlock(&root->fs_commit_mutex);
+
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+static void start_caching(struct btrfs_root *root)
+{
+	struct task_struct *tsk;
+
+	spin_lock(&root->cache_lock);
+	if (root->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&root->cache_lock);
+		return;
+	}
+
+	root->cached = BTRFS_CACHE_STARTED;
+	spin_unlock(&root->cache_lock);
+
+	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
+			  root->root_key.objectid);
+	BUG_ON(IS_ERR(tsk));
+}
+
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
+{
+again:
+	*objectid = btrfs_find_ino_for_alloc(root);
+
+	if (*objectid != 0)
+		return 0;
+
+	start_caching(root);
+
+	wait_event(root->cache_wait,
+		   root->cached == BTRFS_CACHE_FINISHED ||
+		   root->free_ino_ctl->free_space > 0);
+
+	if (root->cached == BTRFS_CACHE_FINISHED &&
+	    root->free_ino_ctl->free_space == 0)
+		return -ENOSPC;
+	else
+		goto again;
+}
+
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+again:
+	if (root->cached == BTRFS_CACHE_FINISHED) {
+		__btrfs_add_free_space(ctl, objectid, 1);
+	} else {
+		/*
+		 * If we are in the process of caching free ino chunks,
+		 * to avoid adding the same inode number to the free_ino
+		 * tree twice due to cross transaction, we'll leave it
+		 * in the pinned tree until a transaction is committed
+		 * or the caching work is done.
+		 */
+
+		mutex_lock(&root->fs_commit_mutex);
+		spin_lock(&root->cache_lock);
+		if (root->cached == BTRFS_CACHE_FINISHED) {
+			spin_unlock(&root->cache_lock);
+			mutex_unlock(&root->fs_commit_mutex);
+			goto again;
+		}
+		spin_unlock(&root->cache_lock);
+
+		start_caching(root);
+
+		if (objectid <= root->cache_progress)
+			__btrfs_add_free_space(ctl, objectid, 1);
+		else
+			__btrfs_add_free_space(pinned, objectid, 1);
+
+		mutex_unlock(&root->fs_commit_mutex);
+	}
+}
+
+/*
+ * When a transaction is committed, we'll move those inode numbers which
+ * are smaller than root->cache_progress from the pinned tree to the
+ * free_ino tree, and the others will just be dropped, because the commit
+ * root we were searching has changed.
+ *
+ * Must be called with root->fs_commit_mutex held.
+ */
+void btrfs_unpin_free_ino(struct btrfs_root *root)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	u64 count;
+
+	while (1) {
+		n = rb_first(rbroot);
+		if (!n)
+			break;
+
+		info = rb_entry(n, struct btrfs_free_space, offset_index);
+		BUG_ON(info->bitmap);
+
+		if (info->offset > root->cache_progress)
+			goto free;
+		else if (info->offset + info->bytes > root->cache_progress)
+			count = root->cache_progress - info->offset + 1;
+		else
+			count = info->bytes;
+
+		__btrfs_add_free_space(ctl, info->offset, count);
+free:
+		rb_erase(&info->offset_index, rbroot);
+		kfree(info);
+	}
+}
+
+#define INIT_THRESHOLD	(((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+
+/*
+ * The goal is to keep the memory used by the free_ino tree from
+ * exceeding the memory we would use if we used bitmaps only.
+ */
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	int max_ino;
+	int max_bitmaps;
+
+	n = rb_last(&ctl->free_space_offset);
+	if (!n) {
+		ctl->extents_thresh = INIT_THRESHOLD;
+		return;
+	}
+	info = rb_entry(n, struct btrfs_free_space, offset_index);
+
+	/*
+	 * Find the maximum inode number in the filesystem. Note we
+	 * ignore the fact that this can be a bitmap, because we are
+	 * not doing a precise calculation.
+	 */
+	max_ino = info->bytes - 1;
+
+	max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
+	if (max_bitmaps <= ctl->total_bitmaps) {
+		ctl->extents_thresh = 0;
+		return;
+	}
+
+	ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
+				PAGE_CACHE_SIZE / sizeof(*info);
+}
+
+/*
+ * We don't fall back to bitmaps if we are below the extents threshold
+ * or if this chunk of inode numbers is a big one.
+ */
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+		       struct btrfs_free_space *info)
+{
+	if (ctl->free_extents < ctl->extents_thresh ||
+	    info->bytes > INODES_PER_BITMAP / 10)
+		return false;
+
+	return true;
+}
+
+static struct btrfs_free_space_op free_ino_op = {
+	.recalc_thresholds	= recalculate_thresholds,
+	.use_bitmap		= use_bitmap,
+};
+
+static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+}
+
+static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info)
+{
+	/*
+	 * We always use extents for two reasons:
+	 *
+	 * - The pinned tree is only used during the process of caching
+	 *   work.
+	 * - It makes the code simpler. See btrfs_unpin_free_ino().
+	 */
+	return false;
+}
+
+static struct btrfs_free_space_op pinned_free_ino_op = {
+	.recalc_thresholds	= pinned_recalc_thresholds,
+	.use_bitmap		= pinned_use_bitmap,
+};
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root)
+{
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+	spin_lock_init(&ctl->tree_lock);
+	ctl->unit = 1;
+	ctl->start = 0;
+	ctl->private = NULL;
+	ctl->op = &free_ino_op;
+
+	/*
+	 * Initially we allow 16K of ram to cache chunks of inode
+	 * numbers before we resort to bitmaps. This is somewhat
+	 * arbitrary, but it will be adjusted at runtime.
+	 */
+	ctl->extents_thresh = INIT_THRESHOLD;
+
+	spin_lock_init(&pinned->tree_lock);
+	pinned->unit = 1;
+	pinned->start = 0;
+	pinned->private = NULL;
+	pinned->extents_thresh = 0;
+	pinned->op = &pinned_free_ino_op;
+}
+
+static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -55,15 +387,14 @@ error:
 	return ret;
 }
 
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
-			     u64 dirid, u64 *objectid)
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
{
 	int ret;
 	mutex_lock(&root->objectid_mutex);
 
 	if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
-		ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+		ret = btrfs_find_highest_objectid(root,
+						  &root->highest_objectid);
 		if (ret)
 			goto out;
 	}