Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6be2ded1 authored by Aneesh Kumar K.V's avatar Aneesh Kumar K.V Committed by Theodore Ts'o
Browse files

ext4: Don't allow lg prealloc list to be grow large.



Currently, the locality group prealloc list is freed only when there
is a block allocation failure. This can result in large number of
entries in the preallocation list making ext4_mb_use_preallocated()
expensive.

To fix this, we convert the locality group prealloc list to a hash
list. The hash index is the order of number of blocks in the prealloc
space with a max order of 9. When adding prealloc space to the list we
make sure total entries for each order does not exceed 8. If it is
more than 8 we discard few entries and make sure the we have only <= 5
entries.

Signed-off-by: default avatarAneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 1320cbcf
Loading
Loading
Loading
Loading
+185 −28
Original line number Diff line number Diff line
@@ -2480,7 +2480,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
int ext4_mb_init(struct super_block *sb, int needs_recovery)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned i;
	unsigned i, j;
	unsigned offset;
	unsigned max;
	int ret;
@@ -2552,7 +2552,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
		struct ext4_locality_group *lg;
		lg = &sbi->s_locality_groups[i];
		mutex_init(&lg->lg_mutex);
		INIT_LIST_HEAD(&lg->lg_prealloc_list);
		for (j = 0; j < PREALLOC_TB_SIZE; j++)
			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
		spin_lock_init(&lg->lg_prealloc_lock);
	}

@@ -3263,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
				struct ext4_prealloc_space *pa)
{
	unsigned int len = ac->ac_o_ex.fe_len;

	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
					&ac->ac_b_ex.fe_group,
					&ac->ac_b_ex.fe_start);
@@ -3285,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
	int order, i;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_locality_group *lg;
	struct ext4_prealloc_space *pa;
@@ -3325,11 +3328,18 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
	lg = ac->ac_lg;
	if (lg == NULL)
		return 0;
	order  = fls(ac->ac_o_ex.fe_len) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		/* The max size of hash table is PREALLOC_TB_SIZE */
		order = PREALLOC_TB_SIZE - 1;

	for (i = order; i < PREALLOC_TB_SIZE; i++) {
		rcu_read_lock();
	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
					pa_inode_list) {
			spin_lock(&pa->pa_lock);
		if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
			if (pa->pa_deleted == 0 &&
					pa->pa_free >= ac->ac_o_ex.fe_len) {
				atomic_inc(&pa->pa_count);
				ext4_mb_use_group_pa(ac, pa);
				spin_unlock(&pa->pa_lock);
@@ -3340,7 +3350,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
			spin_unlock(&pa->pa_lock);
		}
		rcu_read_unlock();

	}
	return 0;
}

@@ -3563,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
	pa->pa_free = pa->pa_len;
	atomic_set(&pa->pa_count, 1);
	spin_lock_init(&pa->pa_lock);
	INIT_LIST_HEAD(&pa->pa_inode_list);
	pa->pa_deleted = 0;
	pa->pa_linear = 1;

@@ -3583,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

	spin_lock(pa->pa_obj_lock);
	list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
	spin_unlock(pa->pa_obj_lock);

	/*
	 * We will later add the new pa to the right bucket
	 * after updating the pa_free in ext4_mb_release_context
	 */
	return 0;
}

@@ -4123,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,

}

static noinline_for_stack void
ext4_mb_discard_lg_preallocations(struct super_block *sb,
					struct ext4_locality_group *lg,
					int order, int total_entries)
{
	ext4_group_t group = 0;
	struct ext4_buddy e4b;
	struct list_head discard_list;
	struct ext4_prealloc_space *pa, *tmp;
	struct ext4_allocation_context *ac;

	mb_debug("discard locality group preallocation\n");

	INIT_LIST_HEAD(&discard_list);
	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);

	spin_lock(&lg->lg_prealloc_lock);
	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
						pa_inode_list) {
		spin_lock(&pa->pa_lock);
		if (atomic_read(&pa->pa_count)) {
			/*
			 * This is the pa that we just used
			 * for block allocation. So don't
			 * free that
			 */
			spin_unlock(&pa->pa_lock);
			continue;
		}
		if (pa->pa_deleted) {
			spin_unlock(&pa->pa_lock);
			continue;
		}
		/* only lg prealloc space */
		BUG_ON(!pa->pa_linear);

		/* seems this one can be freed ... */
		pa->pa_deleted = 1;
		spin_unlock(&pa->pa_lock);

		list_del_rcu(&pa->pa_inode_list);
		list_add(&pa->u.pa_tmp_list, &discard_list);

		total_entries--;
		if (total_entries <= 5) {
			/*
			 * we want to keep only 5 entries
			 * allowing it to grow to 8. This
			 * mak sure we don't call discard
			 * soon for this list.
			 */
			break;
		}
	}
	spin_unlock(&lg->lg_prealloc_lock);

	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {

		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
		if (ext4_mb_load_buddy(sb, group, &e4b)) {
			ext4_error(sb, __func__, "Error in loading buddy "
					"information for %lu\n", group);
			continue;
		}
		ext4_lock_group(sb, group);
		list_del(&pa->pa_group_list);
		ext4_mb_release_group_pa(&e4b, pa, ac);
		ext4_unlock_group(sb, group);

		ext4_mb_release_desc(&e4b);
		list_del(&pa->u.pa_tmp_list);
		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
	}
	if (ac)
		kmem_cache_free(ext4_ac_cachep, ac);
}

/*
 * We have incremented pa_count. So it cannot be freed at this
 * point. Also we hold lg_mutex. So no parallel allocation is
 * possible from this lg. That means pa_free cannot be updated.
 *
 * A parallel ext4_mb_discard_group_preallocations is possible.
 * which can cause the lg_prealloc_list to be updated.
 */

static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
	int order, added = 0, lg_prealloc_count = 1;
	struct super_block *sb = ac->ac_sb;
	struct ext4_locality_group *lg = ac->ac_lg;
	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;

	order = fls(pa->pa_free) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		/* The max size of hash table is PREALLOC_TB_SIZE */
		order = PREALLOC_TB_SIZE - 1;
	/* Add the prealloc space to lg */
	rcu_read_lock();
	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
						pa_inode_list) {
		spin_lock(&tmp_pa->pa_lock);
		if (tmp_pa->pa_deleted) {
			spin_unlock(&pa->pa_lock);
			continue;
		}
		if (!added && pa->pa_free < tmp_pa->pa_free) {
			/* Add to the tail of the previous entry */
			list_add_tail_rcu(&pa->pa_inode_list,
						&tmp_pa->pa_inode_list);
			added = 1;
			/*
			 * we want to count the total
			 * number of entries in the list
			 */
		}
		spin_unlock(&tmp_pa->pa_lock);
		lg_prealloc_count++;
	}
	if (!added)
		list_add_tail_rcu(&pa->pa_inode_list,
					&lg->lg_prealloc_list[order]);
	rcu_read_unlock();

	/* Now trim the list to be not more than 8 elements */
	if (lg_prealloc_count > 8) {
		ext4_mb_discard_lg_preallocations(sb, lg,
						order, lg_prealloc_count);
		return;
	}
	return ;
}

/*
 * release all resource we used in allocation
 */
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
	if (ac->ac_pa) {
		if (ac->ac_pa->pa_linear) {
	struct ext4_prealloc_space *pa = ac->ac_pa;
	if (pa) {
		if (pa->pa_linear) {
			/* see comment in ext4_mb_use_group_pa() */
			spin_lock(&ac->ac_pa->pa_lock);
			ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
			ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
			ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
			ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
			spin_unlock(&ac->ac_pa->pa_lock);
		}
		ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
			spin_lock(&pa->pa_lock);
			pa->pa_pstart += ac->ac_b_ex.fe_len;
			pa->pa_lstart += ac->ac_b_ex.fe_len;
			pa->pa_free -= ac->ac_b_ex.fe_len;
			pa->pa_len -= ac->ac_b_ex.fe_len;
			spin_unlock(&pa->pa_lock);
			/*
			 * We want to add the pa to the right bucket.
			 * Remove it from the list and while adding
			 * make sure the list to which we are adding
			 * doesn't grow big.
			 */
			if (likely(pa->pa_free)) {
				spin_lock(pa->pa_obj_lock);
				list_del_rcu(&pa->pa_inode_list);
				spin_unlock(pa->pa_obj_lock);
				ext4_mb_add_n_trim(ac);
			}
		}
		ext4_mb_put_pa(ac, ac->ac_sb, pa);
	}
	if (ac->ac_bitmap_page)
		page_cache_release(ac->ac_bitmap_page);
+8 −2
Original line number Diff line number Diff line
@@ -164,11 +164,17 @@ struct ext4_free_extent {
 * Locality group:
 *   we try to group all related changes together
 *   so that writeback can flush/allocate them together as well
 *   Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
 *   (512). We store prealloc space into the hash based on the pa_free blocks
 *   order value.ie, fls(pa_free)-1;
 */
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
	/* for allocator */
	struct mutex		lg_mutex;	/* to serialize allocates */
	struct list_head	lg_prealloc_list;/* list of preallocations */
	/* to serialize allocates */
	struct mutex		lg_mutex;
	/* list of preallocations */
	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
	spinlock_t		lg_prealloc_lock;
};