Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a288e92c authored by Timofey Titovets's avatar Timofey Titovets Committed by David Sterba
Browse files

Btrfs: heuristic: add byte set calculation



Calculate byte set size for data sample:
- calculate how many unique bytes have been in the sample
- for all bytes count > 0, check if we're still in the low count range
  (~25%), such data are easily compressible, otherwise furhter analysis
  is needed

Signed-off-by: default avatarTimofey Titovets <nefelim4ag@gmail.com>
Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
[ update comments ]
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 1fe4f6fa
Loading
Loading
Loading
Loading
+45 −0
Original line number Diff line number Diff line
@@ -1222,6 +1222,45 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
	return 1;
}

/*
 * Count byte values in buckets.
 * This heuristic can detect textual data (configs, xml, json, html, etc).
 * Because in most text-like data byte set is restricted to limited number of
 * possible characters, and that restriction in most cases makes data easy to
 * compress.
 *
 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
 *	less - compressible
 *	more - need additional analysis
 */
#define BYTE_SET_THRESHOLD		(64)

static u32 byte_set_size(const struct heuristic_ws *ws)
{
	u32 i;
	u32 byte_set_size = 0;

	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
		if (ws->bucket[i].count > 0)
			byte_set_size++;
	}

	/*
	 * Continue collecting count of byte values in buckets.  If the byte
	 * set size is bigger then the threshold, it's pointless to continue,
	 * the detection technique would fail for this type of data.
	 */
	for (; i < BUCKET_SIZE; i++) {
		if (ws->bucket[i].count > 0) {
			byte_set_size++;
			if (byte_set_size > BYTE_SET_THRESHOLD)
				return byte_set_size;
		}
	}

	return byte_set_size;
}

static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
	const u32 half_of_sample = ws->sample_size / 2;
@@ -1321,6 +1360,12 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
		ws->bucket[byte].count++;
	}

	i = byte_set_size(ws);
	if (i < BYTE_SET_THRESHOLD) {
		ret = 2;
		goto out;
	}

out:
	__free_workspace(0, ws_list, true);
	return ret;