Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Unverified Commit 5b68ef13 authored by Sandeep Dhavale's avatar Sandeep Dhavale Committed by Michael Bestas
Browse files

BACKPORT: erofs: add per-cpu threads for decompression as an option

Using per-cpu thread pool we can reduce the scheduling latency compared
to workqueue implementation. With this patch scheduling latency and
variation is reduced as per-cpu threads are high priority kthread_workers.

The results were evaluated on arm64 Android devices running 5.10 kernel.

The table below shows resulting improvements of total scheduling latency
for the same app launch benchmark runs with 50 iterations. Scheduling
latency is the latency between when the task (workqueue kworker vs
kthread_worker) became eligible to run to when it actually started
running.
+-------------------------+-----------+----------------+---------+
|                         | workqueue | kthread_worker |  diff   |
+-------------------------+-----------+----------------+---------+
| Average (us)            |     15253 |           2914 | -80.89% |
| Median (us)             |     14001 |           2912 | -79.20% |
| Minimum (us)            |      3117 |           1027 | -67.05% |
| Maximum (us)            |     30170 |           3805 | -87.39% |
| Standard deviation (us) |      7166 |            359 |         |
+-------------------------+-----------+----------------+---------+

Background: Boot times and cold app launch benchmarks are very
important to the Android ecosystem as they directly translate to
responsiveness from user point of view. While EROFS provides
a lot of important features like space savings, we saw some
performance penalty in cold app launch benchmarks in few scenarios.
Analysis showed that the significant variance was coming from the
scheduling cost while decompression cost was more or less the same.

Having per-cpu thread pool we can see from the above table that this
variation is reduced by ~80% on average. This problem was discussed
at LPC 2022. Link to LPC 2022 slides and talk at [1]

[1] https://lpc.events/event/16/contributions/1338/

[ Gao Xiang: At least, we have to add this until WQ_UNBOUND workqueue
             issue [2] on many arm64 devices is resolved. ]
[2] https://lore.kernel.org/r/CAJkfWY490-m6wNubkxiTPsW59sfsQs37Wey279LmiRxKt7aQYg@mail.gmail.com



Bug: 271636421
Bug: 278520205
Test: launch_cvd
Change-Id: I9dce2bfd6f40ec6a210161b80cee7c0417b4edb3
Signed-off-by: default avatarSandeep Dhavale <dhavale@google.com>
Signed-off-by: default avatarGao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20230208093322.75816-1-hsiangkao@linux.alibaba.com


(cherry picked from commit 3fffb589b9a6e331e39cb75373ee7691acd7b109)
[dhavale: Fixed minor conflict as upstream now has zdata.h folded in
zdata.c]
Signed-off-by: default avatarSandeep Dhavale <dhavale@google.com>
(cherry picked from commit 566a7f6c6b3f5f13b766fe749bbdb45918b029ac)
[dhavale: Fixed minor conflicts in Kconfig and zdata.c]
(cherry picked from commit 2de95f5d183c2174c9380a902919c8e59e380293)
parent 260d03bf
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -76,3 +76,20 @@ config EROFS_FS_ZIP

	  If you don't want to enable compression feature, say N.

config EROFS_FS_PCPU_KTHREAD
	bool "EROFS per-cpu decompression kthread workers"
	depends on EROFS_FS_ZIP
	help
	  Saying Y here enables per-CPU kthread workers pool to carry out
	  async decompression for low latencies on some architectures.

	  If unsure, say N.

config EROFS_FS_PCPU_KTHREAD_HIPRI
	bool "EROFS high priority per-CPU kthread workers"
	depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD
	help
	  This permits EROFS to configure per-CPU kthread workers to run
	  at higher priority.

	  If unsure, say N.
+167 −18
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>

#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>

/*
@@ -124,24 +124,128 @@ typedef tagptr1_t compressed_page_t;

static struct workqueue_struct *z_erofs_workqueue __read_mostly;

void z_erofs_exit_zip_subsystem(void)
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;

static void erofs_destroy_percpu_workers(void)
{
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
	struct kthread_worker *worker;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		worker = rcu_dereference_protected(
					z_erofs_pcpu_workers[cpu], 1);
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
		if (worker)
			kthread_destroy_worker(worker);
	}
	kfree(z_erofs_pcpu_workers);
}

static inline int z_erofs_init_workqueue(void)
static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
	const unsigned int onlinecpus = num_possible_cpus();
	struct kthread_worker *worker =
		kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);

	/*
	 * no need to spawn too many threads, limiting threads could minimum
	 * scheduling overhead, perhaps per-CPU threads should be better?
	 */
	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
					    WQ_UNBOUND | WQ_HIGHPRI,
					    onlinecpus + onlinecpus / 4);
	return z_erofs_workqueue ? 0 : -ENOMEM;
	if (IS_ERR(worker))
		return worker;
	if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
		sched_set_fifo_low(worker->task);
	else
		sched_set_normal(worker->task, 0);
	return worker;
}

static int erofs_init_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
			sizeof(struct kthread_worker *), GFP_ATOMIC);
	if (!z_erofs_pcpu_workers)
		return -ENOMEM;

	for_each_online_cpu(cpu) {	/* could miss cpu{off,on}line? */
		worker = erofs_init_percpu_worker(cpu);
		if (!IS_ERR(worker))
			rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	}
	return 0;
}
#else
static inline void erofs_destroy_percpu_workers(void) {}
static inline int erofs_init_percpu_workers(void) { return 0; }
#endif

#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;

static int erofs_cpu_online(unsigned int cpu)
{
	struct kthread_worker *worker, *old;

	worker = erofs_init_percpu_worker(cpu);
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	spin_lock(&z_erofs_pcpu_worker_lock);
	old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	if (!old)
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	spin_unlock(&z_erofs_pcpu_worker_lock);
	if (old)
		kthread_destroy_worker(worker);
	return 0;
}

static int erofs_cpu_offline(unsigned int cpu)
{
	struct kthread_worker *worker;

	spin_lock(&z_erofs_pcpu_worker_lock);
	worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
	spin_unlock(&z_erofs_pcpu_worker_lock);

	synchronize_rcu();
	if (worker)
		kthread_destroy_worker(worker);
	return 0;
}

static int erofs_cpu_hotplug_init(void)
{
	int state;

	state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
			"fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
	if (state < 0)
		return state;

	erofs_cpuhp_state = state;
	return 0;
}

static void erofs_cpu_hotplug_destroy(void)
{
	if (erofs_cpuhp_state)
		cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif

void z_erofs_exit_zip_subsystem(void)
{
	erofs_cpu_hotplug_destroy();
	erofs_destroy_percpu_workers();
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
}

int __init z_erofs_init_zip_subsystem(void)
@@ -149,10 +253,29 @@ int __init z_erofs_init_zip_subsystem(void)
	int err = z_erofs_create_pcluster_pool();

	if (err)
		return err;
	err = z_erofs_init_workqueue();
		goto out_error_pcluster_pool;

	z_erofs_workqueue = alloc_workqueue("erofs_worker",
			WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
	if (!z_erofs_workqueue)
		goto out_error_workqueue_init;

	err = erofs_init_percpu_workers();
	if (err)
		goto out_error_pcpu_worker;

	err = erofs_cpu_hotplug_init();
	if (err < 0)
		goto out_error_cpuhp_init;
	return err;

out_error_cpuhp_init:
	erofs_destroy_percpu_workers();
out_error_pcpu_worker:
	destroy_workqueue(z_erofs_workqueue);
out_error_workqueue_init:
	z_erofs_destroy_pcluster_pool();
out_error_pcluster_pool:
	return err;
}

@@ -772,6 +895,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
}

static void z_erofs_decompressqueue_work(struct work_struct *work);
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
{
	z_erofs_decompressqueue_work((struct work_struct *)work);
}
#endif
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       bool sync, int bios)
{
@@ -789,7 +918,22 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
		return;
	/* Use workqueue and sync decompression for atomic contexts only */
	if (in_atomic() || irqs_disabled()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
		struct kthread_worker *worker;

		rcu_read_lock();
		worker = rcu_dereference(
				z_erofs_pcpu_workers[raw_smp_processor_id()]);
		if (!worker) {
			INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
			queue_work(z_erofs_workqueue, &io->u.work);
		} else {
			kthread_queue_work(worker, &io->u.kthread_work);
		}
		rcu_read_unlock();
#else
		queue_work(z_erofs_workqueue, &io->u.work);
#endif
		sbi->readahead_sync_decompress = true;
		return;
	}
@@ -1197,7 +1341,12 @@ jobqueue_init(struct super_block *sb,
			*fg = true;
			goto fg_out;
		}
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
		kthread_init_work(&q->u.kthread_work,
				  z_erofs_decompressqueue_kthread_work);
#else
		INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
#endif
	} else {
fg_out:
		q = fgq;
@@ -1335,7 +1484,7 @@ static void z_erofs_submit_queue(struct super_block *sb,

	/*
	 * although background is preferred, no one is pending for submission.
	 * don't issue workqueue for decompression but drop it directly instead.
	 * don't issue decompression but drop it directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
+2 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#ifndef __EROFS_FS_ZDATA_H
#define __EROFS_FS_ZDATA_H

#include <linux/kthread.h>
#include "internal.h"
#include "zpvec.h"

@@ -91,6 +92,7 @@ struct z_erofs_decompressqueue {
	union {
		struct completion done;
		struct work_struct work;
		struct kthread_work kthread_work;
	} u;
};