
Commit edaa53ca authored by Zheng Liu, committed by Theodore Ts'o

ext4: change LRU to round-robin in extent status tree shrinker



In this commit we discard the LRU algorithm for inodes with an extent
status tree because it takes significant effort to maintain an LRU list
in the extent status tree shrinker, and the shrinker can take a long
time to scan this LRU list in order to reclaim some objects.

We replace the LRU ordering with a simple round-robin.  After that we
no longer need to keep an LRU list, which also means the list does not
have to be sorted when the shrinker cannot reclaim any objects in the
first round.
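
To make the round-robin idea concrete, the listing below is a minimal
user-space sketch, not the kernel code: the real shrinker rotates inodes
on s_es_list under s_es_lock and skips precached inodes on the first
pass, all of which is omitted here, and every name in the sketch
(cached[], cursor, shrink()) is hypothetical.

#include <stdio.h>

#define NR_INODES 4

/* Hypothetical count of reclaimable extents cached in each inode. */
static int cached[NR_INODES] = { 3, 0, 5, 2 };

/* Cursor remembering where the previous scan stopped. */
static int cursor;

/*
 * Round-robin shrink: walk the inodes starting at the cursor and reclaim
 * until nr_to_scan objects are freed or every inode has been visited once.
 * No access-time ordering is maintained; fairness comes from the rotation.
 */
static int shrink(int nr_to_scan)
{
	int nr_shrunk = 0;

	for (int walked = 0; walked < NR_INODES && nr_to_scan > 0; walked++) {
		int i = cursor;
		int freed;

		cursor = (cursor + 1) % NR_INODES;	/* rotate for the next call */

		freed = cached[i] < nr_to_scan ? cached[i] : nr_to_scan;
		cached[i] -= freed;
		nr_to_scan -= freed;
		nr_shrunk += freed;
	}
	return nr_shrunk;
}

int main(void)
{
	printf("first pass freed %d objects\n", shrink(4));
	printf("second pass freed %d objects\n", shrink(4));
	return 0;
}

The kernel achieves the same rotation without an index: __es_shrink()
takes the first inode off s_es_list, moves it to the tail with
list_move(), and tries to reclaim from it, so successive shrinker calls
naturally continue where the previous one stopped.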

Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
parent 2f8e0a7c
fs/ext4/ext4.h +5 −5
@@ -878,10 +878,9 @@ struct ext4_inode_info {
	/* extents status tree */
	struct ext4_es_tree i_es_tree;
	rwlock_t i_es_lock;
	struct list_head i_es_lru;
	struct list_head i_es_list;
	unsigned int i_es_all_nr;	/* protected by i_es_lock */
	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
	unsigned long i_touch_when;	/* jiffies of last accessing */
	unsigned int i_es_shk_nr;	/* protected by i_es_lock */

	/* ialloc */
	ext4_group_t	i_last_alloc_group;
@@ -1322,10 +1321,11 @@ struct ext4_sb_info {

	/* Reclaim extents from extent status tree */
	struct shrinker s_es_shrinker;
	struct list_head s_es_lru;
	struct list_head s_es_list;
	long s_es_nr_inode;
	struct ext4_es_stats s_es_stats;
	struct mb_cache *s_mb_cache;
	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
	spinlock_t s_es_lock ____cacheline_aligned_in_smp;

	/* Ratelimit ext4 messages. */
	struct ratelimit_state s_err_ratelimit_state;
fs/ext4/extents.c +2 −2
@@ -4632,7 +4632,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,

	trace_ext4_ext_map_blocks_exit(inode, flags, map,
				       err ? err : allocated);
	ext4_es_lru_add(inode);
	ext4_es_list_add(inode);
	return err ? err : allocated;
}

@@ -5191,7 +5191,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		error = ext4_fill_fiemap_extents(inode, start_blk,
						 len_blks, fieinfo);
	}
	ext4_es_lru_add(inode);
	ext4_es_list_add(inode);
	return error;
}

fs/ext4/extents_status.c +97 −127
@@ -149,7 +149,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
				       int nr_to_scan);
static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei);

int __init ext4_init_es(void)
@@ -298,6 +298,36 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}

void ext4_es_list_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (!list_empty(&ei->i_es_list))
		return;

	spin_lock(&sbi->s_es_lock);
	if (list_empty(&ei->i_es_list)) {
		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
		sbi->s_es_nr_inode++;
	}
	spin_unlock(&sbi->s_es_lock);
}

void ext4_es_list_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lock);
	if (!list_empty(&ei->i_es_list)) {
		list_del_init(&ei->i_es_list);
		sbi->s_es_nr_inode--;
		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
	}
	spin_unlock(&sbi->s_es_lock);
}

static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
		     ext4_fsblk_t pblk)
@@ -314,9 +344,9 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
	 * We don't count delayed extent because we never try to reclaim them
	 */
	if (!ext4_es_is_delayed(es)) {
		EXT4_I(inode)->i_es_lru_nr++;
		EXT4_I(inode)->i_es_shk_nr++;
		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_lru_cnt);
					s_es_stats.es_stats_shk_cnt);
	}

	EXT4_I(inode)->i_es_all_nr++;
@@ -330,12 +360,12 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
	EXT4_I(inode)->i_es_all_nr--;
	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	/* Decrease the lru counter when this es is not delayed */
	/* Decrease the shrink counter when this es is not delayed */
	if (!ext4_es_is_delayed(es)) {
		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
		EXT4_I(inode)->i_es_lru_nr--;
		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
		EXT4_I(inode)->i_es_shk_nr--;
		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_lru_cnt);
					s_es_stats.es_stats_shk_cnt);
	}

	kmem_cache_free(ext4_es_cachep, es);
@@ -683,8 +713,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
		goto error;
retry:
	err = __es_insert_extent(inode, &newes);
	if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
					       EXT4_I(inode)))
	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
					  1, EXT4_I(inode)))
		goto retry;
	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
		err = 0;
@@ -841,8 +871,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				if ((err == -ENOMEM) &&
				    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
						     EXT4_I(inode)))
				    __es_shrink(EXT4_SB(inode->i_sb),
							1, EXT4_I(inode)))
					goto retry;
				goto out;
			}
@@ -914,6 +944,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
	end = lblk + len - 1;
	BUG_ON(end < lblk);

	/*
	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
	 * so that we are sure __es_shrink() is done with the inode before it
	 * is reclaimed.
	 */
	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -921,114 +956,80 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
	return err;
}

static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
				     struct list_head *b)
{
	struct ext4_inode_info *eia, *eib;
	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
	eib = list_entry(b, struct ext4_inode_info, i_es_lru);

	if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
	    !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
		return 1;
	if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
	    ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
		return -1;
	if (eia->i_touch_when == eib->i_touch_when)
		return 0;
	if (time_after(eia->i_touch_when, eib->i_touch_when))
		return 1;
	else
		return -1;
}

static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei)
{
	struct ext4_inode_info *ei;
	struct ext4_es_stats *es_stats;
	struct list_head *cur, *tmp;
	LIST_HEAD(skipped);
	ktime_t start_time;
	u64 scan_time;
	int nr_to_walk;
	int nr_shrunk = 0;
	int retried = 0, skip_precached = 1, nr_skipped = 0;
	int retried = 0, nr_skipped = 0;

	es_stats = &sbi->s_es_stats;
	start_time = ktime_get();
	spin_lock(&sbi->s_es_lru_lock);

retry:
	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
	spin_lock(&sbi->s_es_lock);
	nr_to_walk = sbi->s_es_nr_inode;
	while (nr_to_walk-- > 0) {
		int shrunk;

		/*
		 * If we have already reclaimed all extents from extent
		 * status tree, just stop the loop immediately.
		 */
		if (percpu_counter_read_positive(
				&es_stats->es_stats_lru_cnt) == 0)
			break;

		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
		if (list_empty(&sbi->s_es_list)) {
			spin_unlock(&sbi->s_es_lock);
			goto out;
		}
		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
				      i_es_list);
		/* Move the inode to the tail */
		list_move(&ei->i_es_list, sbi->s_es_list.prev);

		/*
		 * Skip the inode that is newer than the last_sorted
		 * time.  Normally we try hard to avoid shrinking
		 * precached inodes, but we will as a last resort.
		 * Normally we try hard to avoid shrinking precached inodes,
		 * but we will as a last resort.
		 */
		if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
						EXT4_STATE_EXT_PRECACHED))) {
		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
						EXT4_STATE_EXT_PRECACHED)) {
			nr_skipped++;
			list_move_tail(cur, &skipped);
			continue;
		}

		if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
		    !write_trylock(&ei->i_es_lock))
		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
			nr_skipped++;
			continue;
		}
		/*
		 * Now we hold i_es_lock which protects us from inode reclaim
		 * freeing inode under us
		 */
		spin_unlock(&sbi->s_es_lock);

		shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
		if (ei->i_es_lru_nr == 0)
			list_del_init(&ei->i_es_lru);
		write_unlock(&ei->i_es_lock);

		nr_shrunk += shrunk;
		nr_to_scan -= shrunk;

		if (nr_to_scan == 0)
			break;
			goto out;
		spin_lock(&sbi->s_es_lock);
	}

	/* Move the newer inodes into the tail of the LRU list. */
	list_splice_tail(&skipped, &sbi->s_es_lru);
	INIT_LIST_HEAD(&skipped);
	spin_unlock(&sbi->s_es_lock);

	/*
	 * If we skipped any inodes, and we weren't able to make any
	 * forward progress, sort the list and try again.
	 * forward progress, try again to scan precached inodes.
	 */
	if ((nr_shrunk == 0) && nr_skipped && !retried) {
		retried++;
		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
		es_stats->es_stats_last_sorted = jiffies;
		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
				      i_es_lru);
		/*
		 * If there are no non-precached inodes left on the
		 * list, start releasing precached extents.
		 */
		if (ext4_test_inode_state(&ei->vfs_inode,
					  EXT4_STATE_EXT_PRECACHED))
			skip_precached = 0;
		goto retry;
	}

	spin_unlock(&sbi->s_es_lru_lock);

	if (locked_ei && nr_shrunk == 0)
		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);

out:
	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	if (likely(es_stats->es_stats_scan_time))
		es_stats->es_stats_scan_time = (scan_time +
@@ -1043,7 +1044,7 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
	else
		es_stats->es_stats_shrunk = nr_shrunk;

	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
			     nr_skipped, retried);
	return nr_shrunk;
}
@@ -1055,7 +1056,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
	struct ext4_sb_info *sbi;

	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
	return nr;
}
@@ -1068,13 +1069,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
	int nr_to_scan = sc->nr_to_scan;
	int ret, nr_shrunk;

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	if (!nr_to_scan)
		return ret;

	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);

	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
@@ -1102,28 +1103,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
		return 0;

	/* here we just find an inode that has the max nr. of objects */
	spin_lock(&sbi->s_es_lru_lock);
	list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
	spin_lock(&sbi->s_es_lock);
	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
		inode_cnt++;
		if (max && max->i_es_all_nr < ei->i_es_all_nr)
			max = ei;
		else if (!max)
			max = ei;
	}
	spin_unlock(&sbi->s_es_lru_lock);
	spin_unlock(&sbi->s_es_lock);

	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
		   percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
		   es_stats->es_stats_cache_hits,
		   es_stats->es_stats_cache_misses);
	if (es_stats->es_stats_last_sorted != 0)
		seq_printf(seq, "  %u ms last sorted interval\n",
			   jiffies_to_msecs(jiffies -
					    es_stats->es_stats_last_sorted));
	if (inode_cnt)
		seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
		seq_printf(seq, "  %d inodes on list\n", inode_cnt);

	seq_printf(seq, "average:\n  %llu us scan time\n",
	    div_u64(es_stats->es_stats_scan_time, 1000));
@@ -1132,7 +1129,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
		seq_printf(seq,
		    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
		    "  %llu us max scan time\n",
		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
		    div_u64(es_stats->es_stats_max_scan_time, 1000));

	return 0;
@@ -1181,9 +1178,9 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
	int err;

	INIT_LIST_HEAD(&sbi->s_es_lru);
	spin_lock_init(&sbi->s_es_lru_lock);
	sbi->s_es_stats.es_stats_last_sorted = 0;
	INIT_LIST_HEAD(&sbi->s_es_list);
	sbi->s_es_nr_inode = 0;
	spin_lock_init(&sbi->s_es_lock);
	sbi->s_es_stats.es_stats_shrunk = 0;
	sbi->s_es_stats.es_stats_cache_hits = 0;
	sbi->s_es_stats.es_stats_cache_misses = 0;
@@ -1192,7 +1189,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
	if (err)
		return err;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL);
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
	if (err)
		goto err1;

@@ -1210,7 +1207,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
	return 0;

err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err1:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	return err;
@@ -1221,37 +1218,10 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
	if (sbi->s_proc)
		remove_proc_entry("es_shrinker_info", sbi->s_proc);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
	unregister_shrinker(&sbi->s_es_shrinker);
}

void ext4_es_lru_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	ei->i_touch_when = jiffies;

	if (!list_empty(&ei->i_es_lru))
		return;

	spin_lock(&sbi->s_es_lru_lock);
	if (list_empty(&ei->i_es_lru))
		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
	spin_unlock(&sbi->s_es_lru_lock);
}

void ext4_es_lru_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lru_lock);
	if (!list_empty(&ei->i_es_lru))
		list_del_init(&ei->i_es_lru);
	spin_unlock(&sbi->s_es_lru_lock);
}

static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
				       int nr_to_scan)
{
@@ -1263,7 +1233,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (ei->i_es_lru_nr == 0)
	if (ei->i_es_shk_nr == 0)
		return 0;

	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
fs/ext4/extents_status.h +3 −4
@@ -65,14 +65,13 @@ struct ext4_es_tree {
};

struct ext4_es_stats {
	unsigned long es_stats_last_sorted;
	unsigned long es_stats_shrunk;
	unsigned long es_stats_cache_hits;
	unsigned long es_stats_cache_misses;
	u64 es_stats_scan_time;
	u64 es_stats_max_scan_time;
	struct percpu_counter es_stats_all_cnt;
	struct percpu_counter es_stats_lru_cnt;
	struct percpu_counter es_stats_shk_cnt;
};

extern int __init ext4_init_es(void);
@@ -151,7 +150,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,

extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
extern void ext4_es_list_add(struct inode *inode);
extern void ext4_es_list_del(struct inode *inode);

#endif /* _EXT4_EXTENTS_STATUS_H */
fs/ext4/inode.c +2 −2
@@ -486,7 +486,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,

	/* Lookup extent status tree firstly */
	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
		ext4_es_lru_add(inode);
		ext4_es_list_add(inode);
		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
			map->m_pblk = ext4_es_pblock(&es) +
					map->m_lblk - es.es_lblk;
@@ -1388,7 +1388,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,

	/* Lookup extent status tree firstly */
	if (ext4_es_lookup_extent(inode, iblock, &es)) {
		ext4_es_lru_add(inode);
		ext4_es_list_add(inode);
		if (ext4_es_is_hole(&es)) {
			retval = 0;
			down_read(&EXT4_I(inode)->i_data_sem);