Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3d733633 authored by Dave Hansen's avatar Dave Hansen Committed by Al Viro
Browse files

[PATCH] r/o bind mounts: track numbers of writers to mounts

This is the real meat of the entire series.  It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time.  Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.

This uses a statically-allocated percpu variable.  All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances.  Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two

Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.

I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.

http://sr71.net/~dave/linux/openbench.c



The code in here is a a worst-possible case for this patch.  It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely.  This worst-case scenario causes a 3%
degredation in the benchmark.

I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory.  In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.

(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable.  So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)

[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount

Acked-by: default avatarAl Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: default avatarChristoph Hellwig <hch@infradead.org>
Signed-off-by: default avatarDave Hansen <haveblue@us.ibm.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
parent 2c463e95
Loading
Loading
Loading
Loading
+237 −15
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
	return tmp & (HASH_SIZE - 1);
}

#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)

struct vfsmount *alloc_vfsmnt(const char *name)
{
	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		atomic_set(&mnt->__mnt_writers, 0);
		if (name) {
			int size = strlen(name) + 1;
			char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
	return mnt;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly ouside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	return (mnt->mnt_sb->s_flags & MS_RDONLY);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

struct mnt_writer {
	/*
	 * If holding multiple instances of this lock, they
	 * must be ordered by cpu number.
	 */
	spinlock_t lock;
	struct lock_class_key lock_class; /* compiles out with !lockdep */
	unsigned long count;
	struct vfsmount *mnt;
} ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);

static int __init init_mnt_writers(void)
{
	int cpu;
	for_each_possible_cpu(cpu) {
		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
		spin_lock_init(&writer->lock);
		lockdep_set_class(&writer->lock, &writer->lock_class);
		writer->count = 0;
	}
	return 0;
}
fs_initcall(init_mnt_writers);

static void unlock_mnt_writers(void)
{
	int cpu;
	struct mnt_writer *cpu_writer;

	for_each_possible_cpu(cpu) {
		cpu_writer = &per_cpu(mnt_writers, cpu);
		spin_unlock(&cpu_writer->lock);
	}
}

static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
{
	if (!cpu_writer->mnt)
		return;
	/*
	 * This is in case anyone ever leaves an invalid,
	 * old ->mnt and a count of 0.
	 */
	if (!cpu_writer->count)
		return;
	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
	cpu_writer->count = 0;
}
 /*
 * must hold cpu_writer->lock
 */
static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
					  struct vfsmount *mnt)
{
	if (cpu_writer->mnt == mnt)
		return;
	__clear_mnt_count(cpu_writer);
	cpu_writer->mnt = mnt;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
@@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 */
int mnt_want_write(struct vfsmount *mnt)
{
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	return 0;
	int ret = 0;
	struct mnt_writer *cpu_writer;

	cpu_writer = &get_cpu_var(mnt_writers);
	spin_lock(&cpu_writer->lock);
	if (__mnt_is_readonly(mnt)) {
		ret = -EROFS;
		goto out;
	}
	use_cpu_writer_for_mount(cpu_writer, mnt);
	cpu_writer->count++;
out:
	spin_unlock(&cpu_writer->lock);
	put_cpu_var(mnt_writers);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

static void lock_mnt_writers(void)
{
	int cpu;
	struct mnt_writer *cpu_writer;

	for_each_possible_cpu(cpu) {
		cpu_writer = &per_cpu(mnt_writers, cpu);
		spin_lock(&cpu_writer->lock);
		__clear_mnt_count(cpu_writer);
		cpu_writer->mnt = NULL;
	}
}

/*
 * These per-cpu write counts are not guaranteed to have
 * matched increments and decrements on any given cpu.
 * A file open()ed for write on one cpu and close()d on
 * another cpu will imbalance this count.  Make sure it
 * does not get too far out of whack.
 */
static void handle_write_count_underflow(struct vfsmount *mnt)
{
	if (atomic_read(&mnt->__mnt_writers) >=
	    MNT_WRITER_UNDERFLOW_LIMIT)
		return;
	/*
	 * It isn't necessary to hold all of the locks
	 * at the same time, but doing it this way makes
	 * us share a lot more code.
	 */
	lock_mnt_writers();
	/*
	 * vfsmount_lock is for mnt_flags.
	 */
	spin_lock(&vfsmount_lock);
	/*
	 * If coalescing the per-cpu writer counts did not
	 * get us back to a positive writer count, we have
	 * a bug.
	 */
	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
		printk(KERN_DEBUG "leak detected on mount(%p) writers "
				"count: %d\n",
			mnt, atomic_read(&mnt->__mnt_writers));
		WARN_ON(1);
		/* use the flag to keep the dmesg spam down */
		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
	}
	spin_unlock(&vfsmount_lock);
	unlock_mnt_writers();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
@@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	int must_check_underflow = 0;
	struct mnt_writer *cpu_writer;

	cpu_writer = &get_cpu_var(mnt_writers);
	spin_lock(&cpu_writer->lock);

	use_cpu_writer_for_mount(cpu_writer, mnt);
	if (cpu_writer->count > 0) {
		cpu_writer->count--;
	} else {
		must_check_underflow = 1;
		atomic_dec(&mnt->__mnt_writers);
	}
EXPORT_SYMBOL_GPL(mnt_drop_write);

	spin_unlock(&cpu_writer->lock);
	/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly ouside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
	 * Logically, we could call this each time,
	 * but the __mnt_writers cacheline tends to
	 * be cold, and makes this expensive.
	 */
int __mnt_is_readonly(struct vfsmount *mnt)
	if (must_check_underflow)
		handle_write_count_underflow(mnt);
	/*
	 * This could be done right after the spinlock
	 * is taken because the spinlock keeps us on
	 * the cpu, and disables preemption.  However,
	 * putting it here bounds the amount that
	 * __mnt_writers can underflow.  Without it,
	 * we could theoretically wrap __mnt_writers.
	 */
	put_cpu_var(mnt_writers);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

int mnt_make_readonly(struct vfsmount *mnt)
{
	return (mnt->mnt_sb->s_flags & MS_RDONLY);
	int ret = 0;

	lock_mnt_writers();
	/*
	 * With all the locks held, this value is stable
	 */
	if (atomic_read(&mnt->__mnt_writers) > 0) {
		ret = -EBUSY;
		goto out;
	}
	/*
	 * actually set mount's r/o flag here to make
	 * __mnt_is_readonly() true, which keeps anyone
	 * from doing a successful mnt_want_write().
	 */
out:
	unlock_mnt_writers();
	return ret;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
@@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,

static inline void __mntput(struct vfsmount *mnt)
{
	int cpu;
	struct super_block *sb = mnt->mnt_sb;
	/*
	 * We don't have to hold all of the locks at the
	 * same time here because we know that we're the
	 * last reference to mnt and that no new writers
	 * can come in.
	 */
	for_each_possible_cpu(cpu) {
		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
		if (cpu_writer->mnt != mnt)
			continue;
		spin_lock(&cpu_writer->lock);
		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
		cpu_writer->count = 0;
		/*
		 * Might as well do this so that no one
		 * ever sees the pointer and expects
		 * it to be valid.
		 */
		cpu_writer->mnt = NULL;
		spin_unlock(&cpu_writer->lock);
	}
	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair.  If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	WARN_ON(atomic_read(&mnt->__mnt_writers));
	dput(mnt->mnt_root);
	free_vfsmnt(mnt);
	deactivate_super(sb);
+7 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@

#include <linux/types.h>
#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

@@ -30,6 +31,7 @@ struct mnt_namespace;
#define MNT_RELATIME	0x20

#define MNT_SHRINKABLE	0x100
#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */

#define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -62,6 +64,11 @@ struct vfsmount {
	int mnt_expiry_mark;		/* true if marked for expiry */
	int mnt_pinned;
	int mnt_ghosts;
	/*
	 * This value is not stable unless all of the mnt_writers[] spinlocks
	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
	 */
	atomic_t __mnt_writers;
};

static inline struct vfsmount *mntget(struct vfsmount *mnt)