Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a07b2000 authored by Al Viro's avatar Al Viro
Browse files

vfs: syscall: Add open_tree(2) to reference or clone a mount



open_tree(dfd, pathname, flags)

Returns an O_PATH-opened file descriptor or an error.
dfd and pathname specify the location to open, in usual
fashion (see e.g. fstatat(2)).  flags should be an OR of
some of the following:
	* AT_PATH_EMPTY, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW -
same meanings as usual
	* OPEN_TREE_CLOEXEC - make the resulting descriptor
close-on-exec
	* OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE -
instead of opening the location in question, create a detached
mount tree matching the subtree rooted at location specified by
dfd/pathname.  With AT_RECURSIVE the entire subtree is cloned,
without it - only the part within in the mount containing the
location in question.  In other words, the same as mount --rbind
or mount --bind would've taken.  The detached tree will be
dissolved on the final close of obtained file.  Creation of such
detached trees requires the same capabilities as doing mount --bind.

Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
Signed-off-by: default avatarDavid Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
parent 9e98c678
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -398,7 +398,8 @@
384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
385	i386	io_pgetevents		sys_io_pgetevents_time32	__ia32_compat_sys_io_pgetevents
386	i386	rseq			sys_rseq			__ia32_sys_rseq
# don't use numbers 387 through 392, add new calls at the end
387	i386	open_tree		sys_open_tree			__ia32_sys_open_tree
# don't use numbers 388 through 392, add new calls at the end
393	i386	semget			sys_semget    			__ia32_sys_semget
394	i386	semctl			sys_semctl    			__ia32_compat_sys_semctl
395	i386	shmget			sys_shmget    			__ia32_sys_shmget
+1 −0
Original line number Diff line number Diff line
@@ -343,6 +343,7 @@
332	common	statx			__x64_sys_statx
333	common	io_pgetevents		__x64_sys_io_pgetevents
334	common	rseq			__x64_sys_rseq
335	common	open_tree		__x64_sys_open_tree
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal
+6 −3
Original line number Diff line number Diff line
@@ -255,6 +255,7 @@ static void __fput(struct file *file)
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;
	fmode_t mode = file->f_mode;

	if (unlikely(!(file->f_mode & FMODE_OPENED)))
		goto out;
@@ -277,18 +278,20 @@ static void __fput(struct file *file)
	if (file->f_op->release)
		file->f_op->release(inode, file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(file->f_mode & FMODE_PATH))) {
		     !(mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	put_pid(file->f_owner.pid);
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_dec(inode);
	if (file->f_mode & FMODE_WRITER) {
	if (mode & FMODE_WRITER) {
		put_write_access(inode);
		__mnt_drop_write(mnt);
	}
	dput(dentry);
	if (unlikely(mode & FMODE_NEED_UNMOUNT))
		dissolve_on_fput(mnt);
	mntput(mnt);
out:
	file_free(file);
+1 −0
Original line number Diff line number Diff line
@@ -94,6 +94,7 @@ extern int __mnt_want_write_file(struct file *);
extern void __mnt_drop_write(struct vfsmount *);
extern void __mnt_drop_write_file(struct file *);

extern void dissolve_on_fput(struct vfsmount *);
/*
 * fs_struct.c
 */
+135 −22
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et.al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
@@ -1832,6 +1833,21 @@ struct vfsmount *collect_mounts(const struct path *path)
	return &tree->mnt;
}

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

void dissolve_on_fput(struct vfsmount *mnt)
{
	struct mnt_namespace *ns;
	namespace_lock();
	lock_mount_hash();
	ns = real_mount(mnt)->mnt_ns;
	umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
	unlock_mount_hash();
	namespace_unlock();
	free_mnt_ns(ns);
}

void drop_collected_mounts(struct vfsmount *mnt)
{
	namespace_lock();
@@ -2222,6 +2238,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
	return false;
}

static struct mount *__do_loopback(struct path *old_path, int recurse)
{
	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);

	if (IS_MNT_UNBINDABLE(old))
		return mnt;

	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
		return mnt;

	if (!recurse && has_locked_children(old, old_path->dentry))
		return mnt;

	if (recurse)
		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
	else
		mnt = clone_mnt(old, old_path->dentry, 0);

	if (!IS_ERR(mnt))
		mnt->mnt.mnt_flags &= ~MNT_LOCKED;

	return mnt;
}

/*
 * do loopback mount.
 */
@@ -2229,7 +2269,7 @@ static int do_loopback(struct path *path, const char *old_name,
				int recurse)
{
	struct path old_path;
	struct mount *mnt = NULL, *old, *parent;
	struct mount *mnt = NULL, *parent;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
@@ -2243,38 +2283,21 @@ static int do_loopback(struct path *path, const char *old_name,
		goto out;

	mp = lock_mount(path);
	if (IS_ERR(mp)) {
		err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;
	}

	old = real_mount(old_path.mnt);
	parent = real_mount(path->mnt);

	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old))
		goto out2;

	if (!check_mnt(parent))
		goto out2;

	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
		goto out2;

	if (!recurse && has_locked_children(old, old_path.dentry))
		goto out2;

	if (recurse)
		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
	else
		mnt = clone_mnt(old, old_path.dentry, 0);

	mnt = __do_loopback(&old_path, recurse);
	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out2;
	}

	mnt->mnt.mnt_flags &= ~MNT_LOCKED;

	err = graft_tree(mnt, parent, mp);
	if (err) {
		lock_mount_hash();
@@ -2288,6 +2311,96 @@ static int do_loopback(struct path *path, const char *old_name,
	return err;
}

static struct file *open_detached_copy(struct path *path, bool recursive)
{
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
	struct mount *mnt, *p;
	struct file *file;

	if (IS_ERR(ns))
		return ERR_CAST(ns);

	namespace_lock();
	mnt = __do_loopback(path, recursive);
	if (IS_ERR(mnt)) {
		namespace_unlock();
		free_mnt_ns(ns);
		return ERR_CAST(mnt);
	}

	lock_mount_hash();
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		p->mnt_ns = ns;
		ns->mounts++;
	}
	ns->root = mnt;
	list_add_tail(&ns->list, &mnt->mnt_list);
	mntget(&mnt->mnt);
	unlock_mount_hash();
	namespace_unlock();

	mntput(path->mnt);
	path->mnt = &mnt->mnt;
	file = dentry_open(path, O_PATH, current_cred());
	if (IS_ERR(file))
		dissolve_on_fput(path->mnt);
	else
		file->f_mode |= FMODE_NEED_UNMOUNT;
	return file;
}

SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags)
{
	struct file *file;
	struct path path;
	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
	bool detached = flags & OPEN_TREE_CLONE;
	int error;
	int fd;

	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
		      OPEN_TREE_CLOEXEC))
		return -EINVAL;

	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
		return -EINVAL;

	if (flags & AT_NO_AUTOMOUNT)
		lookup_flags &= ~LOOKUP_AUTOMOUNT;
	if (flags & AT_SYMLINK_NOFOLLOW)
		lookup_flags &= ~LOOKUP_FOLLOW;
	if (flags & AT_EMPTY_PATH)
		lookup_flags |= LOOKUP_EMPTY;

	if (detached && !may_mount())
		return -EPERM;

	fd = get_unused_fd_flags(flags & O_CLOEXEC);
	if (fd < 0)
		return fd;

	error = user_path_at(dfd, filename, lookup_flags, &path);
	if (unlikely(error)) {
		file = ERR_PTR(error);
	} else {
		if (detached)
			file = open_detached_copy(&path, flags & AT_RECURSIVE);
		else
			file = dentry_open(&path, O_PATH, current_cred());
		path_put(&path);
	}
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}
	fd_install(fd, file);
	return fd;
}

/*
 * Don't allow locked mount flags to be cleared.
 *
Loading