
Commit 5e6feacb authored by Yurii Zubrytskyi, committed by Paul Lawrence

ANDROID: Incremental fs: make remount log buffer change atomic



The read log buffer can have multiple threads performing any of these
operations simultaneously:
- Polling for changes
- Reading log records
- Adding new log records
- Updating log buffer size, or enabling/disabling it completely

As we don't control userspace, and it turns out these operations all
currently originate from different processes, the code needs to be
safe against parallel access to the read buffer and a concurrent
request to reallocate it.

This CL adds an r/w spinlock to protect the buffer and its size.
Each remount takes the write lock, while everything else takes a
read lock. The remount path keeps its critical section short by
preallocating and precalculating all updates; the other operations
are less sensitive to critical-section length, and they can all
still run concurrently.

Bug: 152633648
Test: manual remount + reading
Signed-off-by: Yurii Zubrytskyi <zyy@google.com>
Signed-off-by: Paul Lawrence <paullawrence@google.com>
Change-Id: I7271b4cb89f1ae2cbee6e5b073758f344c4ba66a
parent 5128381d
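
The scheme the message describes — preallocate outside the lock, then swap the buffer under a briefly held write lock while every other path holds a read lock — is a common kernel idiom. As a rough standalone sketch with hypothetical names (not the incfs code itself; the real hunks follow below):

/*
 * Sketch only: hypothetical struct and function names, illustrating
 * the locking scheme described in the commit message.
 */
#include <linux/slab.h>
#include <linux/spinlock.h>

struct ring {
	rwlock_t lock;	/* write-locked only to swap/resize the buffer */
	void *buf;
	size_t size;
};

/* Remount path: allocate before taking the lock to keep it short. */
static int ring_resize(struct ring *r, size_t new_size)
{
	void *new_buf = NULL;

	if (new_size) {
		new_buf = kzalloc(new_size, GFP_NOFS);
		if (!new_buf)
			return -ENOMEM;
	}

	write_lock(&r->lock);	/* excludes every read-side path */
	kfree(r->buf);
	r->buf = new_buf;
	r->size = new_size;
	write_unlock(&r->lock);
	return 0;
}

/* All other paths take the read lock, so they can run concurrently. */
static size_t ring_size(struct ring *r)
{
	size_t sz;

	read_lock(&r->lock);
	sz = r->size;	/* r->buf and r->size are stable here */
	read_unlock(&r->lock);
	return sz;
}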
+102 −43
@@ -34,7 +34,8 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
 	mutex_init(&mi->mi_pending_reads_mutex);
 	init_waitqueue_head(&mi->mi_pending_reads_notif_wq);
 	init_waitqueue_head(&mi->mi_log.ml_notif_wq);
-	spin_lock_init(&mi->mi_log.rl_writer_lock);
+	rwlock_init(&mi->mi_log.rl_access_lock);
+	spin_lock_init(&mi->mi_log.rl_logging_lock);
 	INIT_LIST_HEAD(&mi->mi_reads_list_head);
 
 	error = incfs_realloc_mount_info(mi, options);
@@ -51,20 +52,38 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
 int incfs_realloc_mount_info(struct mount_info *mi,
 			     struct mount_options *options)
 {
-	kfree(mi->mi_log.rl_ring_buf);
-	mi->mi_log.rl_ring_buf = NULL;
-	mi->mi_log.rl_size = 0;
-
-	mi->mi_options = *options;
-	if (options->read_log_pages != 0) {
-		size_t buf_size = PAGE_SIZE * options->read_log_pages;
-
-		mi->mi_log.rl_size = buf_size / sizeof(*mi->mi_log.rl_ring_buf);
-		mi->mi_log.rl_ring_buf = kzalloc(buf_size, GFP_NOFS);
-		if (!mi->mi_log.rl_ring_buf)
-			return -ENOMEM;
+	void *new_buffer = NULL;
+	size_t new_buffer_size = 0;
+
+	if (options->read_log_pages != mi->mi_options.read_log_pages) {
+		struct read_log_state log_state;
+		/*
+		 * Even though having two buffers allocated at once isn't
+		 * usually good, allocating a multipage buffer under a spinlock
+		 * is even worse, so let's optimize for the shorter lock
+		 * duration. It's not end of the world if we fail to increase
+		 * the buffer size anyway.
+		 */
+		if (options->read_log_pages > 0) {
+			new_buffer_size = PAGE_SIZE * options->read_log_pages;
+			new_buffer = kzalloc(new_buffer_size, GFP_NOFS);
+			if (!new_buffer)
+				return -ENOMEM;
+		}
+
+		write_lock(&mi->mi_log.rl_access_lock);
+		kfree(mi->mi_log.rl_ring_buf);
+		WRITE_ONCE(mi->mi_log.rl_ring_buf, new_buffer);
+		WRITE_ONCE(mi->mi_log.rl_size,
+			   new_buffer_size / sizeof(*mi->mi_log.rl_ring_buf));
+		log_state = READ_ONCE(mi->mi_log.rl_state);
+		log_state.generation_id++;
+		log_state.next_index = log_state.current_pass_no = 0;
+		WRITE_ONCE(mi->mi_log.rl_state, log_state);
+		write_unlock(&mi->mi_log.rl_access_lock);
 	}
 
+	mi->mi_options = *options;
 	return 0;
 }

@@ -233,6 +252,7 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
 	struct read_log *log = &mi->mi_log;
 	struct read_log_state state;
 	s64 now_us = ktime_to_us(ktime_get());
+	int rl_size;
 	struct read_log_record record = {
 		.file_id = *id,
 		.block_index = block_index,
@@ -240,19 +260,22 @@ static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
 		.timestamp_us = now_us
 	};
 
-	if (log->rl_size == 0)
-		return;
-
-	spin_lock(&log->rl_writer_lock);
+	read_lock(&log->rl_access_lock);
+	rl_size = READ_ONCE(log->rl_size);
+	if (rl_size != 0) {
+		spin_lock(&log->rl_logging_lock);
 		state = READ_ONCE(log->rl_state);
 		log->rl_ring_buf[state.next_index] = record;
-	if (++state.next_index == log->rl_size) {
+		if (++state.next_index == rl_size) {
 			state.next_index = 0;
 			++state.current_pass_no;
 		}
 		WRITE_ONCE(log->rl_state, state);
-	spin_unlock(&log->rl_writer_lock);
+		spin_unlock(&log->rl_logging_lock);
+	}
+	read_unlock(&log->rl_access_lock);
 
+	if (rl_size != 0)
 		wake_up_all(&log->ml_notif_wq);
 }

@@ -1171,9 +1194,11 @@ struct read_log_state incfs_get_log_state(struct mount_info *mi)
 	struct read_log *log = &mi->mi_log;
 	struct read_log_state result;
 
-	spin_lock(&log->rl_writer_lock);
+	read_lock(&log->rl_access_lock);
+	spin_lock(&log->rl_logging_lock);
 	result = READ_ONCE(log->rl_state);
-	spin_unlock(&log->rl_writer_lock);
+	spin_unlock(&log->rl_logging_lock);
+	read_unlock(&log->rl_access_lock);
 	return result;
 }

@@ -1186,10 +1211,21 @@ int incfs_get_uncollected_logs_count(struct mount_info *mi,
 				     struct read_log_state state)
 {
 	struct read_log *log = &mi->mi_log;
+	struct read_log_state rl_state;
+	int rl_size;
+	u64 count;
+
+	read_lock(&log->rl_access_lock);
+	rl_size = READ_ONCE(log->rl_size);
+	spin_lock(&log->rl_logging_lock);
+	rl_state = READ_ONCE(log->rl_state);
+	spin_unlock(&log->rl_logging_lock);
+	read_unlock(&log->rl_access_lock);
 
-	u64 count = calc_record_count(&log->rl_state, log->rl_size) -
-		    calc_record_count(&state, log->rl_size);
-	return min_t(int, count, log->rl_size);
+	count = calc_record_count(&rl_state, rl_size);
+	if (rl_state.generation_id == state.generation_id)
+		count -= calc_record_count(&state, rl_size);
+	return min_t(int, count, rl_size);
 }
 
 static void fill_pending_read_from_log_record(
@@ -1209,17 +1245,35 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 			       int reads_size)
 {
 	struct read_log *log = &mi->mi_log;
-	struct read_log_state live_state = incfs_get_log_state(mi);
-	u64 read_count = calc_record_count(reader_state, log->rl_size);
-	u64 written_count = calc_record_count(&live_state, log->rl_size);
+	struct read_log_state live_state;
 	int dst_idx;
+	int rl_size;
+	int result = 0;
+	u64 read_count;
+	u64 written_count;
 
-	if (reader_state->next_index >= log->rl_size ||
-	    read_count > written_count)
-		return -ERANGE;
+	read_lock(&log->rl_access_lock);
 
-	if (read_count == written_count)
-		return 0;
+	rl_size = READ_ONCE(log->rl_size);
+	spin_lock(&log->rl_logging_lock);
+	live_state = READ_ONCE(log->rl_state);
+	spin_unlock(&log->rl_logging_lock);
+
+	if (reader_state->generation_id != live_state.generation_id) {
+		reader_state->generation_id = live_state.generation_id;
+		reader_state->current_pass_no = reader_state->next_index = 0;
+	}
+
+	read_count = calc_record_count(reader_state, rl_size);
+	written_count = calc_record_count(&live_state, rl_size);
+	if (read_count == written_count) {
+		result = 0;
+		goto out;
+	}
+	if (reader_state->next_index >= rl_size) {
+		result = -ERANGE;
+		goto out;
+	}
 
 	if (read_count > written_count) {
 		/* This reader is somehow ahead of the writer. */
@@ -1227,16 +1281,17 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 		*reader_state = live_state;
 	}
 
-	if (written_count - read_count > log->rl_size) {
+	if (written_count - read_count > rl_size) {
 		/*
 		 * Reading pointer is too far behind,
 		 * start from the record following the write pointer.
 		 */
-		pr_debug("incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
+		pr_debug(
+			"incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n",
 			(u32)reader_state->next_index,
 			(u32)reader_state->current_pass_no,
 			(u32)live_state.next_index,
-			(u32)live_state.current_pass_no - 1, (u32)log->rl_size);
+			(u32)live_state.current_pass_no - 1, (u32)rl_size);
 
 		*reader_state = (struct read_log_state){
 			.next_index = live_state.next_index,
@@ -1252,15 +1307,19 @@ int incfs_collect_logged_reads(struct mount_info *mi,
 		fill_pending_read_from_log_record(
 			&reads[dst_idx],
 			&log->rl_ring_buf[reader_state->next_index],
-			reader_state, log->rl_size);
+			reader_state, rl_size);
 
 		reader_state->next_index++;
-		if (reader_state->next_index == log->rl_size) {
+		if (reader_state->next_index == rl_size) {
 			reader_state->next_index = 0;
 			reader_state->current_pass_no++;
 		}
 	}
-	return dst_idx;
+	result = dst_idx;
+
+out:
+	read_unlock(&log->rl_access_lock);
+	return result;
 }
 
 bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs)
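
The hunks above lean on calc_record_count(), which is not part of this diff. A definition consistent with how it is used here — linearizing a pass number and slot index into a total record count — would presumably be:

/* Presumed shape of the existing helper, not shown in this diff:
 * converts (current_pass_no, next_index) into the total number of
 * records written up to that position in the ring buffer. */
static u64 calc_record_count(const struct read_log_state *state, int rl_size)
{
	return (u64)state->current_pass_no * rl_size + state->next_index;
}

Under that reading, written_count - read_count is the reader's backlog, and the new generation_id comparison keeps the code from subtracting a count that was computed against a previous buffer's geometry.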
+18 −5
@@ -31,10 +31,13 @@ struct read_log_record {
 } __packed;
 
 struct read_log_state {
-	/* Next slot in rl_ring_buf to write to. */
-	u32 next_index;
+	/* Log buffer generation id, incremented on configuration changes */
+	u32 generation_id : 8;
 
-	/* Current number of writer pass over rl_ring_buf */
+	/* Next slot in rl_ring_buf to write into. */
+	u32 next_index : 24;
+
+	/* Current number of writer passes over rl_ring_buf */
 	u32 current_pass_no;
 };

@@ -42,11 +45,21 @@ struct read_log_state {
 struct read_log {
 	struct read_log_record *rl_ring_buf;
 
-	int rl_size;
-
 	struct read_log_state rl_state;
 
-	spinlock_t rl_writer_lock;
+	/*
+	 * A lock for _all_ accesses to the struct, to protect against remounts.
+	 * Taken for writing when resizing the buffer.
+	 */
+	rwlock_t rl_access_lock;
+
+	int rl_size;
+
+	/*
+	 * A lock to protect the actual logging - adding a new record.
+	 * Note: ALWAYS taken after and under the |rl_access_lock|.
+	 */
+	spinlock_t rl_logging_lock;
 
 	/*
 	 * A queue of waiters who want to be notified about reads.
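
To make the ordering rule in that comment concrete, here is a minimal sketch (hypothetical function name) of how a reader of rl_state nests the two locks, mirroring incfs_get_log_state() in the first file of this commit:

/* Hypothetical helper demonstrating the documented lock ordering:
 * rl_access_lock first (shared, blocks a concurrent remount), then
 * rl_logging_lock (blocks a concurrent writer); release in reverse
 * order. */
static struct read_log_state sample_state(struct read_log *log)
{
	struct read_log_state result;

	read_lock(&log->rl_access_lock);
	spin_lock(&log->rl_logging_lock);
	result = READ_ONCE(log->rl_state);
	spin_unlock(&log->rl_logging_lock);
	read_unlock(&log->rl_access_lock);
	return result;
}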
+11 −6
@@ -581,22 +581,27 @@ static ssize_t log_read(struct file *f, char __user *buf, size_t len,
 {
 	struct log_file_state *log_state = f->private_data;
 	struct mount_info *mi = get_mount_info(file_superblock(f));
-	struct incfs_pending_read_info *reads_buf =
-		(struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
-	size_t reads_to_collect = len / sizeof(*reads_buf);
-	size_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
 	int total_reads_collected = 0;
+	int rl_size;
 	ssize_t result = 0;
+	struct incfs_pending_read_info *reads_buf;
+	ssize_t reads_to_collect = len / sizeof(*reads_buf);
+	ssize_t reads_per_page = PAGE_SIZE / sizeof(*reads_buf);
 
+	rl_size = READ_ONCE(mi->mi_log.rl_size);
+	if (rl_size == 0)
+		return 0;
+
+	reads_buf = (struct incfs_pending_read_info *)__get_free_page(GFP_NOFS);
 	if (!reads_buf)
 		return -ENOMEM;
 
-	reads_to_collect = min_t(size_t, mi->mi_log.rl_size, reads_to_collect);
+	reads_to_collect = min_t(ssize_t, rl_size, reads_to_collect);
 	while (reads_to_collect > 0) {
 		struct read_log_state next_state = READ_ONCE(log_state->state);
 		int reads_collected = incfs_collect_logged_reads(
 			mi, &next_state, reads_buf,
-			min_t(size_t, reads_to_collect, reads_per_page));
+			min_t(ssize_t, reads_to_collect, reads_per_page));
 		if (reads_collected <= 0) {
 			result = total_reads_collected ?
 					 total_reads_collected *