Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f10770f5 authored by Artem Bityutskiy's avatar Artem Bityutskiy
Browse files

UBIFS: fully sort GCed nodes



The 'joinup()' function cannot deal with situations when nodes
go in reverse order - it just leaves them in this order. This
patch implement full nodes sorting using n*log(n) algorithm.
It sorts data nodes for bulk-read, and direntry nodes for
readdir().

Signed-off-by: default avatarArtem Bityutskiy <Artem.Bityutskiy@nokia.com>
parent 7d4e9ccb
Loading
Loading
Loading
Loading
+296 −132
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@
 * have to waste large pieces of free space at the end of LEB B, because nodes
 * from LEB A would not fit. And the worst situation is when all nodes are of
 * maximum size. So dark watermark is the amount of free + dirty space in LEB
 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
 * good, and GC takes extra care when moving them.
@@ -56,14 +56,6 @@
#include <linux/pagemap.h>
#include "ubifs.h"

/*
 * GC tries to optimize the way it fit nodes to available space, and it sorts
 * nodes a little. The below constants are watermarks which define "large",
 * "medium", and "small" nodes.
 */
#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
#define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ

/*
 * GC may need to move more than one LEB to make progress. The below constants
 * define "soft" and "hard" limits on the number of LEBs the garbage collector
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
}

/**
 * joinup - bring data nodes for an inode together.
 * @c: UBIFS file-system description object
 * @sleb: describes scanned LEB
 * @inum: inode number
 * @blk: block number
 * @data: list to which to add data nodes
 * list_sort - sort a list.
 * @priv: private data, passed to @cmp
 * @head: the list to sort
 * @cmp: the elements comparison function
 *
 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
 * in ascending order.
 *
 * This function looks at the first few nodes in the scanned LEB @sleb and adds
 * them to @data if they are data nodes from @inum and have a larger block
 * number than @blk. This function returns %0 on success and a negative error
 * code on failure.
 * The comparison function @cmp is supposed to return a negative value if @a is
 * than @b, and a positive value if @a is greater than @b. If @a and @b are
 * equivalent, then it does not matter what this function returns.
 */
static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
		  unsigned int blk, struct list_head *data)
static void list_sort(void *priv, struct list_head *head,
		      int (*cmp)(void *priv, struct list_head *a,
				 struct list_head *b))
{
	int err, cnt = 6, lnum = sleb->lnum, offs;
	struct ubifs_scan_node *snod, *tmp;
	union ubifs_key *key;
	struct list_head *p, *q, *e, *list, *tail, *oldhead;
	int insize, nmerges, psize, qsize, i;

	if (list_empty(head))
		return;

	list = head->next;
	list_del(head);
	insize = 1;
	for (;;) {
		p = oldhead = list;
		list = tail = NULL;
		nmerges = 0;

		while (p) {
			nmerges++;
			q = p;
			psize = 0;
			for (i = 0; i < insize; i++) {
				psize++;
				q = q->next == oldhead ? NULL : q->next;
				if (!q)
					break;
			}

	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
		key = &snod->key;
		if (key_inum(c, key) == inum &&
		    key_type(c, key) == UBIFS_DATA_KEY &&
		    key_block(c, key) > blk) {
			offs = snod->offs;
			err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
			if (err < 0)
				return err;
			list_del(&snod->list);
			if (err) {
				list_add_tail(&snod->list, data);
				blk = key_block(c, key);
			} else
				kfree(snod);
			cnt = 6;
		} else if (--cnt == 0)
			qsize = insize;
			while (psize > 0 || (qsize > 0 && q)) {
				if (!psize) {
					e = q;
					q = q->next;
					qsize--;
					if (q == oldhead)
						q = NULL;
				} else if (!qsize || !q) {
					e = p;
					p = p->next;
					psize--;
					if (p == oldhead)
						p = NULL;
				} else if (cmp(priv, p, q) <= 0) {
					e = p;
					p = p->next;
					psize--;
					if (p == oldhead)
						p = NULL;
				} else {
					e = q;
					q = q->next;
					qsize--;
					if (q == oldhead)
						q = NULL;
				}
				if (tail)
					tail->next = e;
				else
					list = e;
				e->prev = tail;
				tail = e;
			}
			p = q;
		}

		tail->next = list;
		list->prev = tail;

		if (nmerges <= 1)
			break;

		insize *= 2;
	}
	return 0;

	head->next = list;
	head->prev = list->prev;
	list->prev->next = head;
	list->prev = head;
}

/**
 * move_nodes - move nodes.
 * data_nodes_cmp - compare 2 data nodes.
 * @priv: UBIFS file-system description object
 * @a: first data node
 * @a: second data node
 *
 * This function compares data nodes @a and @b. Returns %1 if @a has greater
 * inode or block number, and %-1 otherwise.
 */
int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	ino_t inuma, inumb;
	struct ubifs_info *c = priv;
	struct ubifs_scan_node *sa, *sb;

	cond_resched();
	sa = list_entry(a, struct ubifs_scan_node, list);
	sb = list_entry(b, struct ubifs_scan_node, list);
	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);

	inuma = key_inum(c, &sa->key);
	inumb = key_inum(c, &sb->key);

	if (inuma == inumb) {
		unsigned int blka = key_block(c, &sa->key);
		unsigned int blkb = key_block(c, &sb->key);

		if (blka <= blkb)
			return -1;
	} else if (inuma <= inumb)
		return -1;

	return 1;
}

/*
 * nondata_nodes_cmp - compare 2 non-data nodes.
 * @priv: UBIFS file-system description object
 * @a: first node
 * @a: second node
 *
 * This function compares nodes @a and @b. It makes sure that inode nodes go
 * first and sorted by length in descending order. Directory entry nodes go
 * after inode nodes and are sorted in ascending hash valuer order.
 */
int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	int typea, typeb;
	ino_t inuma, inumb;
	struct ubifs_info *c = priv;
	struct ubifs_scan_node *sa, *sb;

	cond_resched();
	sa = list_entry(a, struct ubifs_scan_node, list);
	sb = list_entry(b, struct ubifs_scan_node, list);
	typea = key_type(c, &sa->key);
	typeb = key_type(c, &sb->key);
	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);

	/* Inodes go before directory entries */
	if (typea == UBIFS_INO_KEY) {
		if (typeb == UBIFS_INO_KEY)
			return sb->len - sa->len;
		return -1;
	}
	if (typeb == UBIFS_INO_KEY)
		return 1;

	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
	inuma = key_inum(c, &sa->key);
	inumb = key_inum(c, &sb->key);

	if (inuma == inumb) {
		uint32_t hasha = key_hash(c, &sa->key);
		uint32_t hashb = key_hash(c, &sb->key);

		if (hasha <= hashb)
			return -1;
	} else if (inuma <= inumb)
		return -1;

	return 1;
}

/**
 * sort_nodes - sort nodes for GC.
 * @c: UBIFS file-system description object
 * @sleb: describes nodes to move
 * @sleb: describes nodes to sort and contains the result on exit
 * @nondata: contains non-data nodes on exit
 * @min: minimum node size is returned here
 *
 * This function moves valid nodes from data LEB described by @sleb to the GC
 * journal head. The obsolete nodes are dropped.
 * This function sorts the list of inodes to garbage collect. First of all, it
 * kills obsolete nodes and separates data and non-data nodes to the
 * @sleb->nodes and @nondata lists correspondingly.
 *
 * Data nodes are then sorted in block number order - this is important for
 * bulk-read; data nodes with lower inode number go before data nodes with
 * higher inode number, and data nodes with lower block number go before data
 * nodes with higher block number;
 *
 * When moving nodes we have to deal with classical bin-packing problem: the
 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
 * where the nodes in the @sleb->nodes list are the elements which should be
 * fit optimally to the bins. This function uses the "first fit decreasing"
 * strategy, although it does not really sort the nodes but just split them on
 * 3 classes - large, medium, and small, so they are roughly sorted.
 * Non-data nodes are sorted as follows.
 *   o First go inode nodes - they are sorted in descending length order.
 *   o Then go directory entry nodes - they are sorted in hash order, which
 *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
 *     inode number go before direntry nodes with higher parent inode number,
 *     and direntry nodes with lower name hash values go before direntry nodes
 *     with higher name hash values.
 *
 * This function returns zero in case of success, %-EAGAIN if commit is
 * required, and other negative error codes in case of other failures.
 * This function returns zero in case of success and a negative error code in
 * case of failure.
 */
static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
		      struct list_head *nondata, int *min)
{
	struct ubifs_scan_node *snod, *tmp;
	struct list_head data, large, medium, small;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
	int avail, err, min = INT_MAX;
	unsigned int blk = 0;
	ino_t inum = 0;

	INIT_LIST_HEAD(&data);
	INIT_LIST_HEAD(&large);
	INIT_LIST_HEAD(&medium);
	INIT_LIST_HEAD(&small);
	*min = INT_MAX;

	while (!list_empty(&sleb->nodes)) {
		struct list_head *lst = sleb->nodes.next;

		snod = list_entry(lst, struct ubifs_scan_node, list);
	/* Separate data nodes and non-data nodes */
	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
		int err;

		ubifs_assert(snod->type != UBIFS_IDX_NODE);
		ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
					 snod->offs, 0);
		if (err < 0)
			goto out;
			return err;

		list_del(lst);
		if (!err) {
			/* The node is obsolete, remove it from the list */
			list_del(&snod->list);
			kfree(snod);
			continue;
		}

		/*
		 * Sort the list of nodes so that data nodes go first, large
		 * nodes go second, and small nodes go last.
		 */
		if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
			if (inum != key_inum(c, &snod->key)) {
				if (inum) {
					/*
					 * Try to move data nodes from the same
					 * inode together.
					 */
					err = joinup(c, sleb, inum, blk, &data);
					if (err)
						goto out;
		if (snod->len < *min)
			*min = snod->len;

		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
			list_move_tail(&snod->list, nondata);
	}
				inum = key_inum(c, &snod->key);
				blk = key_block(c, &snod->key);

	/* Sort data and non-data nodes */
	list_sort(c, &sleb->nodes, &data_nodes_cmp);
	list_sort(c, nondata, &nondata_nodes_cmp);
	return 0;
}
			list_add_tail(lst, &data);
		} else if (snod->len > MEDIUM_NODE_WM)
			list_add_tail(lst, &large);
		else if (snod->len > SMALL_NODE_WM)
			list_add_tail(lst, &medium);
		else
			list_add_tail(lst, &small);

		/* And find the smallest node */
		if (snod->len < min)
			min = snod->len;
/**
 * move_node - move a node.
 * @c: UBIFS file-system description object
 * @sleb: describes the LEB to move nodes from
 * @snod: the mode to move
 * @wbuf: write-buffer to move node to
 *
 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
 * destroys @snod. Returns zero in case of success and a negative error code in
 * case of failure.
 */
static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
{
	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;

	cond_resched();
	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
	if (err)
		return err;

	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
				snod->offs, new_lnum, new_offs,
				snod->len);
	list_del(&snod->list);
	kfree(snod);
	return err;
}

	/*
	 * Join the tree lists so that we'd have one roughly sorted list
	 * ('large' will be the head of the joined list).
/**
 * move_nodes - move nodes.
 * @c: UBIFS file-system description object
 * @sleb: describes the LEB to move nodes from
 *
 * This function moves valid nodes from data LEB described by @sleb to the GC
 * journal head. This function returns zero in case of success, %-EAGAIN if
 * commit is required, and other negative error codes in case of other
 * failures.
 */
	list_splice(&data, &large);
	list_splice(&medium, large.prev);
	list_splice(&small, large.prev);
static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
{
	int err, min;
	LIST_HEAD(nondata);
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;

	if (wbuf->lnum == -1) {
		/*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
		 */
		err = switch_gc_head(c);
		if (err)
			goto out;
			return err;
	}

	err = sort_nodes(c, sleb, &nondata, &min);
	if (err)
		goto out;

	/* Write nodes to their new location. Use the first-fit strategy */
	while (1) {
		int avail;
		struct ubifs_scan_node *snod, *tmp;

		/* Move data nodes */
		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
			avail = c->leb_size - wbuf->offs - wbuf->used;
		list_for_each_entry_safe(snod, tmp, &large, list) {
			int new_lnum, new_offs;
			if  (snod->len > avail)
				/*
				 * Do not skip data nodes in order to optimize
				 * bulk-read.
				 */
				break;

			err = move_node(c, sleb, snod, wbuf);
			if (err)
				goto out;
		}

		/* Move non-data nodes */
		list_for_each_entry_safe(snod, tmp, &nondata, list) {
			avail = c->leb_size - wbuf->offs - wbuf->used;
			if (avail < min)
				break;

			if (snod->len > avail)
				/* This node does not fit */
			if  (snod->len > avail) {
				/*
				 * Keep going only if this is an inode with
				 * some data. Otherwise stop and switch the GC
				 * head. IOW, we assume that data-less inode
				 * nodes and direntry nodes are roughly of the
				 * same size.
				 */
				if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
				    snod->len == UBIFS_INO_NODE_SZ)
					break;
				continue;
			}

			cond_resched();

			new_lnum = wbuf->lnum;
			new_offs = wbuf->offs + wbuf->used;
			err = ubifs_wbuf_write_nolock(wbuf, snod->node,
						      snod->len);
			if (err)
				goto out;
			err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
						snod->offs, new_lnum, new_offs,
						snod->len);
			err = move_node(c, sleb, snod, wbuf);
			if (err)
				goto out;

			avail = c->leb_size - wbuf->offs - wbuf->used;
			list_del(&snod->list);
			kfree(snod);
		}

		if (list_empty(&large))
		if (list_empty(&sleb->nodes) && list_empty(&nondata))
			break;

		/*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
	return 0;

out:
	list_for_each_entry_safe(snod, tmp, &large, list) {
		list_del(&snod->list);
		kfree(snod);
	}
	list_splice_tail(&nondata, &sleb->nodes);
	return err;
}