Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 440c840c authored by Timofey Titovets, committed by David Sterba
Browse files

Btrfs: compression heuristic: replace heap sort with radix sort



The slowest part of the heuristic for now is the kernel heap sort().
It can take up to 55% of the runtime, spent sorting bucket items.

As sorting will always be called on most data sets to compute
byte_core_set_size correctly, the only way to speed up the heuristic
is to speed up the sort on the bucket.

Add a general radix_sort function.
Radix sort requires 2 buffers: one the full size of the input array
and one to store the counters (jump addresses).

That increases memory usage per heuristic workspace by 1KiB:
8KiB + 1KiB -> 8KiB + 2KiB

This is an LSD radix sort; I use 4 bits as the base for calculating,
to keep the counters array acceptably small (16 elements * 8 byte).

This radix sort implementation has several points that can be adjusted.
I added them to make radix sort generally usable in the kernel,
like heap sort, if needed.

Performance tested in userspace copy of heuristic code,
throughput:
    - average <-> random data: ~3500 MiB/s - heap  sort
    - average <-> random data: ~6000 MiB/s - radix sort

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
[ coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
parent 1c3063b6
Loading
Loading
Loading
Loading
+123 −7
Original line number Diff line number Diff line
@@ -33,7 +33,6 @@
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include "ctree.h"
#include "disk-io.h"
@@ -752,6 +751,8 @@ struct heuristic_ws {
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	struct list_head list;
};

@@ -763,6 +764,7 @@ static void free_heuristic_ws(struct list_head *ws)

	kvfree(workspace->sample);
	kfree(workspace->bucket);
	kfree(workspace->bucket_b);
	kfree(workspace);
}

@@ -782,6 +784,10 @@ static struct list_head *alloc_heuristic_ws(void)
	if (!ws->bucket)
		goto fail;

	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
	if (!ws->bucket_b)
		goto fail;

	INIT_LIST_HEAD(&ws->list);
	return &ws->list;
fail:
@@ -1278,13 +1284,122 @@ static u32 shannon_entropy(struct heuristic_ws *ws)
	return entropy_sum * 100 / entropy_max;
}

/* Compare buckets by size, ascending */
static int bucket_comp_rev(const void *lv, const void *rv)
#define RADIX_BASE		4U
#define COUNTERS_SIZE		(1U << RADIX_BASE)

/*
 * Extract the 4-bit digit of @num located at bit offset @shift and return it
 * inverted (15 - digit), so that ordering by the returned value reverses the
 * order of the original digits.
 */
static u8 get4bits(u64 num, int shift)
{
	/* COUNTERS_SIZE is a power of two, so masking equals the modulo */
	const u64 digit = (num >> shift) & (COUNTERS_SIZE - 1);

	/* Reverse order */
	return (COUNTERS_SIZE - 1) - digit;
}

static void copy_cell(void *dst, int dest_i, void *src, int src_i)
{
	const struct bucket_item *l = (const struct bucket_item *)lv;
	const struct bucket_item *r = (const struct bucket_item *)rv;
	struct bucket_item *dstv = (struct bucket_item *)dst;
	struct bucket_item *srcv = (struct bucket_item *)src;
	dstv[dest_i] = srcv[src_i];
}

static u64 get_num(const void *a, int i)
{
	struct bucket_item *av = (struct bucket_item *)a;
	return av[i].count;
}

	return r->count - l->count;
/*
 * One counting pass of the radix sort: distribute the @num cells of @src into
 * @dst, ordered by the 4-bit value that @get4bits extracts at @shift.  Cells
 * with the same value keep their relative order.
 */
static void radix_sort_pass(void *dst, void *src, int num, int shift,
			    u64 (*get_num)(const void *, int i),
			    void (*copy_cell)(void *dest, int dest_i,
					      void *src, int src_i),
			    u8 (*get4bits)(u64 num, int shift))
{
	u32 counters[COUNTERS_SIZE];
	u32 addr;
	int i;

	memset(counters, 0, sizeof(counters));

	/* Count how many cells fall into each 4-bit value */
	for (i = 0; i < num; i++) {
		addr = get4bits(get_num(src, i), shift);
		counters[addr]++;
	}

	/* Prefix sums: counters[v] becomes the end offset of value v in dst */
	for (i = 1; i < COUNTERS_SIZE; i++)
		counters[i] += counters[i - 1];

	/* Walk backwards so cells with equal values stay in input order */
	for (i = num - 1; i >= 0; i--) {
		addr = get4bits(get_num(src, i), shift);
		counters[addr]--;
		copy_cell(dst, counters[addr], src, i);
	}
}

/*
 * Use 4 bits as radix base
 * Use 16 u32 counters for calculating new position in buf array
 *
 * The result is left in @array; @array_buf is only scratch space.
 *
 * @array     - array that will be sorted
 * @array_buf - buffer array to store sorting results
 *              must be equal in size to @array
 * @num       - array size
 * @get_num   - function to extract number from array
 * @copy_cell - function to copy data from array to array_buf and vice versa
 * @get4bits  - function to get 4 bits from number at specified offset
 */
static void radix_sort(void *array, void *array_buf, int num,
		       u64 (*get_num)(const void *, int i),
		       void (*copy_cell)(void *dest, int dest_i,
					 void* src, int src_i),
		       u8 (*get4bits)(u64 num, int shift))
{
	u64 max_num;
	u64 buf_num;
	int bitlen;
	int shift;
	int i;

	/*
	 * Fewer than two cells are already in order.  Returning here also
	 * avoids reading element 0 of an empty array.
	 */
	if (num < 2)
		return;

	/*
	 * Try avoid useless loop iterations for small numbers stored in big
	 * counters.  Example: 48 33 4 ... in 64bit array
	 */
	max_num = get_num(array, 0);
	for (i = 1; i < num; i++) {
		buf_num = get_num(array, i);
		if (buf_num > max_num)
			max_num = buf_num;
	}

	/*
	 * ilog2() is the index of the highest set bit, so the number of
	 * significant bits is one more than that.  Using the index directly
	 * would drop the top digit whenever the index is a multiple of
	 * RADIX_BASE * 2 (e.g. a maximum of 1, or of 256..511).  An all-zero
	 * input needs no passes at all.
	 */
	bitlen = max_num ? ilog2(max_num) + 1 : 0;
	/* Each loop iteration below consumes two digits */
	bitlen = ALIGN(bitlen, RADIX_BASE * 2);

	/*
	 * Normal radix expects to move data from a temporary array, to
	 * the main one.  But that requires some CPU time. Avoid that
	 * by doing another sort iteration to original array instead of
	 * memcpy()
	 */
	for (shift = 0; shift < bitlen; shift += RADIX_BASE * 2) {
		radix_sort_pass(array_buf, array, num, shift,
				get_num, copy_cell, get4bits);
		radix_sort_pass(array, array_buf, num, shift + RADIX_BASE,
				get_num, copy_cell, get4bits);
	}
}

/*
@@ -1314,7 +1429,8 @@ static int byte_core_set_size(struct heuristic_ws *ws)
	struct bucket_item *bucket = ws->bucket;

	/* Sort in reverse order */
	sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE, get_num, copy_cell,
			get4bits);

	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
		coreset_sum += bucket[i].count;