
Commit b3e19d92 authored by Nick Piggin

fs: scale mntget/mntput



The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
and these lookups frequently hit the same mount point.

The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. Doing that requires communication with all other CPUs
that may have taken a reference.

We can make refcounts more scalable in a few ways, all of which involve keeping
distributed counters and checking for the global-zero condition less
frequently:

- check the global sum once every interval (this will delay zero detection
  for some interval, so it's probably a showstopper for vfsmounts).

- keep a local count and only take the global sum when the local count reaches
  0 (this is difficult for vfsmounts because we can't hold preemption off for
  the life of a reference, so the counter would need to be per-thread or tied
  strongly to a particular CPU, which requires more locking).

- keep a local difference of increments and decrements, which lets us find the
  refcount by summing the differences across all CPUs. Then keep a single
  integer "long" refcount for slow and long-lasting references, and only take
  the global sum of the local counters when the long refcount is 0.

This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.

This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.

This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc rather than an atomic inc, and mntput just requires a spinlock
and a non-atomic decrement in the common case. However, the code is otherwise
bigger and heavier, so single-threaded performance is basically a wash.
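
To make the scheme concrete, here is a minimal userspace sketch of the
accounting described above. Every name in it is an illustrative stand-in
rather than the kernel API: a pthread mutex stands in for the kernel's mount
locking, and an array of relaxed atomics stands in for real per-CPU counters
(the kernel's per-CPU increment is cheaper still, since each slot is private
to its CPU).

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define NR_CPUS 64

struct ref {
	atomic_long cpu_diff[NR_CPUS];	/* per-CPU increments minus decrements */
	long longrefs;			/* slow, long-lasting references */
	pthread_mutex_t lock;
};

/*
 * Fast path: a short reference is just a local increment of this CPU's
 * slot; no lock and no single shared counter. This is only safe because
 * anyone taking a reference must already hold one, so the total count
 * cannot be zero while we are here.
 */
static void ref_get_short(struct ref *r, int cpu)
{
	atomic_fetch_add_explicit(&r->cpu_diff[cpu], 1, memory_order_relaxed);
}

/* Sum the per-CPU differences; only called with r->lock held. */
static long short_count(struct ref *r)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += atomic_load_explicit(&r->cpu_diff[cpu],
					    memory_order_relaxed);
	return sum;
}

/*
 * Dropping a short reference takes the lock and decrements the local slot
 * (in the kernel this is a plain, non-atomic per-CPU decrement). The
 * expensive global sum is only taken when no long references remain.
 * Returns true if this was the last reference overall.
 */
static bool ref_put_short(struct ref *r, int cpu)
{
	bool last;

	pthread_mutex_lock(&r->lock);
	atomic_fetch_sub_explicit(&r->cpu_diff[cpu], 1, memory_order_relaxed);
	last = (r->longrefs == 0 && short_count(r) == 0);
	pthread_mutex_unlock(&r->lock);
	return last;
}

/*
 * Long references (attached mounts, a process's root and pwd) just use
 * the plain shared counter under the lock.
 */
static void ref_get_long(struct ref *r)
{
	pthread_mutex_lock(&r->lock);
	r->longrefs++;
	pthread_mutex_unlock(&r->lock);
}

static bool ref_put_long(struct ref *r)
{
	bool last;

	pthread_mutex_lock(&r->lock);
	r->longrefs--;
	last = (r->longrefs == 0 && short_count(r) == 0);
	pthread_mutex_unlock(&r->lock);
	return last;
}

In the hunks below, long-lived holders such as a process's root and pwd
switch to the new *_long variants (path_get_long, path_put_long,
mntput_long), while ordinary path-walk references stay on the short fast
path.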

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
parent c6653a83
+1 −1
@@ -1542,7 +1542,7 @@ pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pfm: will go nicely and kill the special-casing in procfs.
  */
-static struct vfsmount *pfmfs_mnt;
+static struct vfsmount *pfmfs_mnt __read_mostly;
 
 static int __init
 init_pfm_fs(void)
+1 −1
@@ -1201,7 +1201,7 @@ static int __init init_mtdchar(void)
 static void __exit cleanup_mtdchar(void)
 {
 	unregister_mtd_user(&mtdchar_notifier);
-	mntput(mtd_inode_mnt);
+	mntput_long(mtd_inode_mnt);
 	unregister_filesystem(&mtd_inodefs_type);
 	__unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
 }
+1 −1
@@ -232,7 +232,7 @@ static int __init anon_inode_init(void)
 	return 0;
 
 err_mntput:
-	mntput(anon_inode_mnt);
+	mntput_long(anon_inode_mnt);
 err_unregister_filesystem:
 	unregister_filesystem(&anon_inode_fs_type);
 err_exit:
+16 −10
@@ -17,11 +17,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 	write_seqcount_begin(&fs->seq);
 	old_root = fs->root;
 	fs->root = *path;
-	path_get(path);
+	path_get_long(path);
 	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 	if (old_root.dentry)
-		path_put(&old_root);
+		path_put_long(&old_root);
 }
 
 /*
@@ -36,12 +36,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 	write_seqcount_begin(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
-	path_get(path);
+	path_get_long(path);
 	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
-		path_put(&old_pwd);
+		path_put_long(&old_pwd);
 }
 
 void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +59,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 			write_seqcount_begin(&fs->seq);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
-				path_get(new_root);
+				path_get_long(new_root);
 				fs->root = *new_root;
 				count++;
 			}
 			if (fs->pwd.dentry == old_root->dentry
 			    && fs->pwd.mnt == old_root->mnt) {
-				path_get(new_root);
+				path_get_long(new_root);
 				fs->pwd = *new_root;
 				count++;
 			}
@@ -76,13 +76,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 	while (count--)
-		path_put(old_root);
+		path_put_long(old_root);
 }
 
 void free_fs_struct(struct fs_struct *fs)
 {
-	path_put(&fs->root);
-	path_put(&fs->pwd);
+	path_put_long(&fs->root);
+	path_put_long(&fs->pwd);
 	kmem_cache_free(fs_cachep, fs);
 }
 
@@ -115,7 +115,13 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 		spin_lock_init(&fs->lock);
 		seqcount_init(&fs->seq);
 		fs->umask = old->umask;
-		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
+
+		spin_lock(&old->lock);
+		fs->root = old->root;
+		path_get_long(&fs->root);
+		fs->pwd = old->pwd;
+		path_get_long(&fs->pwd);
+		spin_unlock(&old->lock);
 	}
 	return fs;
 }
+1 −0
@@ -63,6 +63,7 @@ extern int copy_mount_string(const void __user *, char **);
 
 extern void free_vfsmnt(struct vfsmount *);
 extern struct vfsmount *alloc_vfsmnt(const char *);
+extern unsigned int mnt_get_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 				struct vfsmount *);