Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 949854d0 authored by Nick Piggin's avatar Nick Piggin
Browse files

fs: Use rename lock and RCU for multi-step operations



The remaining usages for dcache_lock is to allow atomic, multi-step read-side
operations over the directory tree by excluding modifications to the tree.
Also, to walk in the leaf->root direction in the tree where we don't have
a natural d_lock ordering.

This could be accomplished by taking every d_lock, but this would mean a
huge number of locks and actually gets very tricky.

Solve this instead by using the rename seqlock for multi-step read-side
operations, retry in case of a rename so we don't walk up the wrong parent.
Concurrent dentry insertions are not serialised against.  Concurrent deletes
are tricky when walking up the directory: our parent might have been deleted
when dropping locks so also need to check and retry for that.

We can also use the rename lock in cases where livelock is a worry (and it
is introduced in subsequent patch).

Signed-off-by: default avatarNick Piggin <npiggin@kernel.dk>
parent 9abca360
Loading
Loading
Loading
Loading
+12 −3
Original line number Diff line number Diff line
@@ -83,10 +83,11 @@ out:
int pohmelfs_path_length(struct pohmelfs_inode *pi)
{
	struct dentry *d, *root, *first;
	int len = 1; /* Root slash */
	int len;
	unsigned seq;

	first = d = d_find_alias(&pi->vfs_inode);
	if (!d) {
	first = d_find_alias(&pi->vfs_inode);
	if (!first) {
		dprintk("%s: ino: %llu, mode: %o.\n", __func__, pi->ino, pi->vfs_inode.i_mode);
		return -ENOENT;
	}
@@ -95,6 +96,11 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
	root = dget(current->fs->root.dentry);
	spin_unlock(&current->fs->lock);

rename_retry:
	len = 1; /* Root slash */
	d = first;
	seq = read_seqbegin(&rename_lock);
	rcu_read_lock();
	spin_lock(&dcache_lock);

	if (!IS_ROOT(d) && d_unhashed(d))
@@ -105,6 +111,9 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
		d = d->d_parent;
	}
	spin_unlock(&dcache_lock);
	rcu_read_unlock();
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;

	dput(root);
	dput(first);
+15 −3
Original line number Diff line number Diff line
@@ -186,16 +186,25 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
{
	struct dentry *root = sbi->sb->s_root;
	struct dentry *tmp;
	char *buf = *name;
	char *buf;
	char *p;
	int len = 0;

	int len;
	unsigned seq;

rename_retry:
	buf = *name;
	len = 0;
	seq = read_seqbegin(&rename_lock);
	rcu_read_lock();
	spin_lock(&dcache_lock);
	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
		len += tmp->d_name.len + 1;

	if (!len || --len > NAME_MAX) {
		spin_unlock(&dcache_lock);
		rcu_read_unlock();
		if (read_seqretry(&rename_lock, seq))
			goto rename_retry;
		return 0;
	}

@@ -209,6 +218,9 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
		strncpy(p, tmp->d_name.name, tmp->d_name.len);
	}
	spin_unlock(&dcache_lock);
	rcu_read_unlock();
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;

	return len;
}
+111 −23
Original line number Diff line number Diff line
@@ -80,6 +80,7 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);
EXPORT_SYMBOL(dcache_inode_lock);
EXPORT_SYMBOL(dcache_lock);

@@ -243,6 +244,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
	__releases(dcache_inode_lock)
	__releases(dcache_lock)
{
	dentry->d_parent = NULL;
	list_del(&dentry->d_u.d_child);
	if (parent)
		spin_unlock(&parent->d_lock);
@@ -1017,11 +1019,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
 * Return true if the parent or its subdirectories contain
 * a mount point
 */
 
int have_submounts(struct dentry *parent)
{
	struct dentry *this_parent = parent;
	struct dentry *this_parent;
	struct list_head *next;
	unsigned seq;

rename_retry:
	this_parent = parent;
	seq = read_seqbegin(&rename_lock);

	spin_lock(&dcache_lock);
	if (d_mountpoint(parent))
@@ -1055,17 +1061,37 @@ resume:
	 * All done at this level ... ascend and resume the search.
	 */
	if (this_parent != parent) {
		next = this_parent->d_u.d_child.next;
		struct dentry *tmp;
		struct dentry *child;

		tmp = this_parent->d_parent;
		rcu_read_lock();
		spin_unlock(&this_parent->d_lock);
		this_parent = this_parent->d_parent;
		child = this_parent;
		this_parent = tmp;
		spin_lock(&this_parent->d_lock);
		/* might go back up the wrong parent if we have had a rename
		 * or deletion */
		if (this_parent != child->d_parent ||
				read_seqretry(&rename_lock, seq)) {
			spin_unlock(&this_parent->d_lock);
			spin_unlock(&dcache_lock);
			rcu_read_unlock();
			goto rename_retry;
		}
		rcu_read_unlock();
		next = child->d_u.d_child.next;
		goto resume;
	}
	spin_unlock(&this_parent->d_lock);
	spin_unlock(&dcache_lock);
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;
	return 0; /* No mount points found in tree */
positive:
	spin_unlock(&dcache_lock);
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;
	return 1;
}
EXPORT_SYMBOL(have_submounts);
@@ -1086,10 +1112,15 @@ EXPORT_SYMBOL(have_submounts);
 */
static int select_parent(struct dentry * parent)
{
	struct dentry *this_parent = parent;
	struct dentry *this_parent;
	struct list_head *next;
	unsigned seq;
	int found = 0;

rename_retry:
	this_parent = parent;
	seq = read_seqbegin(&rename_lock);

	spin_lock(&dcache_lock);
	spin_lock(&this_parent->d_lock);
repeat:
@@ -1099,7 +1130,6 @@ resume:
		struct list_head *tmp = next;
		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
		next = tmp->next;
		BUG_ON(this_parent == dentry);

		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

@@ -1142,17 +1172,32 @@ resume:
	 */
	if (this_parent != parent) {
		struct dentry *tmp;
		next = this_parent->d_u.d_child.next;
		struct dentry *child;

		tmp = this_parent->d_parent;
		rcu_read_lock();
		spin_unlock(&this_parent->d_lock);
		BUG_ON(tmp == this_parent);
		child = this_parent;
		this_parent = tmp;
		spin_lock(&this_parent->d_lock);
		/* might go back up the wrong parent if we have had a rename
		 * or deletion */
		if (this_parent != child->d_parent ||
				read_seqretry(&rename_lock, seq)) {
			spin_unlock(&this_parent->d_lock);
			spin_unlock(&dcache_lock);
			rcu_read_unlock();
			goto rename_retry;
		}
		rcu_read_unlock();
		next = child->d_u.d_child.next;
		goto resume;
	}
out:
	spin_unlock(&this_parent->d_lock);
	spin_unlock(&dcache_lock);
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;
	return found;
}

@@ -1654,7 +1699,7 @@ EXPORT_SYMBOL(d_add_ci);
struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
{
	struct dentry * dentry = NULL;
	unsigned long seq;
	unsigned seq;

        do {
                seq = read_seqbegin(&rename_lock);
@@ -2290,7 +2335,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 * @buffer: pointer to the end of the buffer
 * @buflen: pointer to buffer length
 *
 * Caller holds the dcache_lock.
 * Caller holds the rename_lock.
 *
 * If path is not reachable from the supplied root, then the value of
 * root is changed (without modifying refcounts).
@@ -2377,7 +2422,9 @@ char *__d_path(const struct path *path, struct path *root,

	prepend(&res, &buflen, "\0", 1);
	spin_lock(&dcache_lock);
	write_seqlock(&rename_lock);
	error = prepend_path(path, root, &res, &buflen);
	write_sequnlock(&rename_lock);
	spin_unlock(&dcache_lock);

	if (error)
@@ -2441,10 +2488,12 @@ char *d_path(const struct path *path, char *buf, int buflen)

	get_fs_root(current->fs, &root);
	spin_lock(&dcache_lock);
	write_seqlock(&rename_lock);
	tmp = root;
	error = path_with_deleted(path, &tmp, &res, &buflen);
	if (error)
		res = ERR_PTR(error);
	write_sequnlock(&rename_lock);
	spin_unlock(&dcache_lock);
	path_put(&root);
	return res;
@@ -2472,10 +2521,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)

	get_fs_root(current->fs, &root);
	spin_lock(&dcache_lock);
	write_seqlock(&rename_lock);
	tmp = root;
	error = path_with_deleted(path, &tmp, &res, &buflen);
	if (!error && !path_equal(&tmp, &root))
		error = prepend_unreachable(&res, &buflen);
	write_sequnlock(&rename_lock);
	spin_unlock(&dcache_lock);
	path_put(&root);
	if (error)
@@ -2544,7 +2595,9 @@ char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
	char *retval;

	spin_lock(&dcache_lock);
	write_seqlock(&rename_lock);
	retval = __dentry_path(dentry, buf, buflen);
	write_sequnlock(&rename_lock);
	spin_unlock(&dcache_lock);

	return retval;
@@ -2557,6 +2610,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
	char *retval;

	spin_lock(&dcache_lock);
	write_seqlock(&rename_lock);
	if (d_unlinked(dentry)) {
		p = buf + buflen;
		if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2564,6 +2618,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
		buflen++;
	}
	retval = __dentry_path(dentry, buf, buflen);
	write_sequnlock(&rename_lock);
	spin_unlock(&dcache_lock);
	if (!IS_ERR(retval) && p)
		*p = '/';	/* restore '/' overriden with '\0' */
@@ -2604,6 +2659,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)

	error = -ENOENT;
	spin_lock(&dcache_lock);
	write_seqlock(&rename_lock);
	if (!d_unlinked(pwd.dentry)) {
		unsigned long len;
		struct path tmp = root;
@@ -2612,6 +2668,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)

		prepend(&cwd, &buflen, "\0", 1);
		error = prepend_path(&pwd, &tmp, &cwd, &buflen);
		write_sequnlock(&rename_lock);
		spin_unlock(&dcache_lock);

		if (error)
@@ -2631,8 +2688,10 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
			if (copy_to_user(buf, cwd, len))
				error = -EFAULT;
		}
	} else
	} else {
		write_sequnlock(&rename_lock);
		spin_unlock(&dcache_lock);
	}

out:
	path_put(&pwd);
@@ -2660,25 +2719,25 @@ out:
int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
	int result;
	unsigned long seq;
	unsigned seq;

	if (new_dentry == old_dentry)
		return 1;

	do {
		/* for restarting inner loop in case of seq retry */
		seq = read_seqbegin(&rename_lock);
		/*
		 * Need rcu_readlock to protect against the d_parent trashing
		 * due to d_move
		 */
		rcu_read_lock();
	do {
		/* for restarting inner loop in case of seq retry */
		seq = read_seqbegin(&rename_lock);
		if (d_ancestor(old_dentry, new_dentry))
			result = 1;
		else
			result = 0;
	} while (read_seqretry(&rename_lock, seq));
		rcu_read_unlock();
	} while (read_seqretry(&rename_lock, seq));

	return result;
}
@@ -2710,9 +2769,13 @@ EXPORT_SYMBOL(path_is_under);

void d_genocide(struct dentry *root)
{
	struct dentry *this_parent = root;
	struct dentry *this_parent;
	struct list_head *next;
	unsigned seq;

rename_retry:
	this_parent = root;
	seq = read_seqbegin(&rename_lock);
	spin_lock(&dcache_lock);
	spin_lock(&this_parent->d_lock);
repeat:
@@ -2722,6 +2785,7 @@ resume:
		struct list_head *tmp = next;
		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
		next = tmp->next;

		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (d_unhashed(dentry) || !dentry->d_inode) {
			spin_unlock(&dentry->d_lock);
@@ -2734,19 +2798,43 @@ resume:
			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
			goto repeat;
		}
		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
			dentry->d_flags |= DCACHE_GENOCIDE;
			dentry->d_count--;
		}
		spin_unlock(&dentry->d_lock);
	}
	if (this_parent != root) {
		next = this_parent->d_u.d_child.next;
		struct dentry *tmp;
		struct dentry *child;

		tmp = this_parent->d_parent;
		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
			this_parent->d_flags |= DCACHE_GENOCIDE;
			this_parent->d_count--;
		}
		rcu_read_lock();
		spin_unlock(&this_parent->d_lock);
		this_parent = this_parent->d_parent;
		child = this_parent;
		this_parent = tmp;
		spin_lock(&this_parent->d_lock);
		/* might go back up the wrong parent if we have had a rename
		 * or deletion */
		if (this_parent != child->d_parent ||
				read_seqretry(&rename_lock, seq)) {
			spin_unlock(&this_parent->d_lock);
			spin_unlock(&dcache_lock);
			rcu_read_unlock();
			goto rename_retry;
		}
		rcu_read_unlock();
		next = child->d_u.d_child.next;
		goto resume;
	}
	spin_unlock(&this_parent->d_lock);
	spin_unlock(&dcache_lock);
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;
}

/**
+13 −1
Original line number Diff line number Diff line
@@ -49,11 +49,17 @@ char *nfs_path(const char *base,
	       const struct dentry *dentry,
	       char *buffer, ssize_t buflen)
{
	char *end = buffer+buflen;
	char *end;
	int namelen;
	unsigned seq;

rename_retry:
	end = buffer+buflen;
	*--end = '\0';
	buflen--;

	seq = read_seqbegin(&rename_lock);
	rcu_read_lock();
	spin_lock(&dcache_lock);
	while (!IS_ROOT(dentry) && dentry != droot) {
		namelen = dentry->d_name.len;
@@ -66,6 +72,9 @@ char *nfs_path(const char *base,
		dentry = dentry->d_parent;
	}
	spin_unlock(&dcache_lock);
	rcu_read_unlock();
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;
	if (*end != '/') {
		if (--buflen < 0)
			goto Elong;
@@ -83,6 +92,9 @@ char *nfs_path(const char *base,
	return end;
Elong_unlock:
	spin_unlock(&dcache_lock);
	rcu_read_unlock();
	if (read_seqretry(&rename_lock, seq))
		goto rename_retry;
Elong:
	return ERR_PTR(-ENAMETOOLONG);
}
+1 −0
Original line number Diff line number Diff line
@@ -180,6 +180,7 @@ struct dentry_operations {
#define DCACHE_FSNOTIFY_PARENT_WATCHED	0x0080 /* Parent inode is watched by some fsnotify listener */

#define DCACHE_CANT_MOUNT	0x0100
#define DCACHE_GENOCIDE		0x0200

extern spinlock_t dcache_inode_lock;
extern spinlock_t dcache_lock;