Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 43788453 authored by Alexander Mikhalitsyn's avatar Alexander Mikhalitsyn Committed by Greg Kroah-Hartman
Browse files

shm: extend forced shm destroy to support objects from several IPC nses

commit 85b6d24646e4125c591639841169baa98a2da503 upstream.

Currently, the exit_shm() function not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.

This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to use-after-free (reproducer exists).

This is an attempt to fix the problem by extending exit_shm mechanism to
handle shm's destroy from several IPC ns'es.

To achieve that we do several things:

1. add a namespace (non-refcounted) pointer to the struct shmid_kernel

2. during new shm object creation (newseg()/shmget syscall) we
   initialize this pointer by current task IPC ns

3. exit_shm() fully reworked such that it traverses over all shp's in
   task->sysvshm.shm_clist and gets IPC namespace not from current task
   as it was before but from shp's object itself, then call
   shm_destroy(shp, ns).

Note: We need to be really careful here, because as it was said before
(1), our pointer to IPC ns non-refcnt'ed.  To be on the safe side we
using special helper get_ipc_ns_not_zero() which allows to get IPC ns
refcounter only if IPC ns not in the "state of destruction".

Q/A

Q: Why can we access shp->ns memory using non-refcounted pointer?
A: Because shp object lifetime is always shorther than IPC namespace
   lifetime, so, if we get shp object from the task->sysvshm.shm_clist
   while holding task_lock(task) nobody can steal our namespace.

Q: Does this patch change semantics of unshare/setns/clone syscalls?
A: No. It's just fixes non-covered case when process may leave IPC
   namespace without getting task->sysvshm.shm_clist list cleaned up.

Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com
Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com


Fixes: ab602f79 ("shm: make exit_shm work proportional to task activity")
Co-developed-by: default avatarManfred Spraul <manfred@colorfullife.com>
Signed-off-by: default avatarManfred Spraul <manfred@colorfullife.com>
Signed-off-by: default avatarAlexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Cc: Vasily Averin <vvs@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent b23c0c4c
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -130,6 +130,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
	return ns;
}

static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
	if (ns) {
		if (refcount_inc_not_zero(&ns->count))
			return ns;
	}

	return NULL;
}

extern void put_ipc_ns(struct ipc_namespace *ns);
#else
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -146,6 +156,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
	return ns;
}

static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
	return ns;
}

static inline void put_ipc_ns(struct ipc_namespace *ns)
{
}
+1 −1
Original line number Diff line number Diff line
@@ -157,7 +157,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 * pins the final release of task.io_context.  Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done.
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
+143 −46
Original line number Diff line number Diff line
@@ -62,9 +62,18 @@ struct shmid_kernel /* private to the kernel */
	struct pid		*shm_lprid;
	struct user_struct	*mlock_user;

	/* The task created the shm object.  NULL if the task is dead. */
	/*
	 * The task created the shm object, for
	 * task_lock(shp->shm_creator)
	 */
	struct task_struct	*shm_creator;
	struct list_head	shm_clist;	/* list by creator */

	/*
	 * List by creator. task_lock(->shm_creator) required for read/write.
	 * If list_empty(), then the creator is dead already.
	 */
	struct list_head	shm_clist;
	struct ipc_namespace	*ns;
} __randomize_layout;

/* shm_mode upper byte flags */
@@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
	WARN_ON(ns != shp->ns);

	if (shp->shm_nattch) {
		shp->shm_perm.mode |= SHM_DEST;
@@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head *head)
	kvfree(shp);
}

static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
/*
 * It has to be called with shp locked.
 * It must be called before ipc_rmid()
 */
static inline void shm_clist_rm(struct shmid_kernel *shp)
{
	list_del(&s->shm_clist);
	ipc_rmid(&shm_ids(ns), &s->shm_perm);
	struct task_struct *creator;

	/* ensure that shm_creator does not disappear */
	rcu_read_lock();

	/*
	 * A concurrent exit_shm may do a list_del_init() as well.
	 * Just do nothing if exit_shm already did the work
	 */
	if (!list_empty(&shp->shm_clist)) {
		/*
		 * shp->shm_creator is guaranteed to be valid *only*
		 * if shp->shm_clist is not empty.
		 */
		creator = shp->shm_creator;

		task_lock(creator);
		/*
		 * list_del_init() is a nop if the entry was already removed
		 * from the list.
		 */
		list_del_init(&shp->shm_clist);
		task_unlock(creator);
	}
	rcu_read_unlock();
}

static inline void shm_rmid(struct shmid_kernel *s)
{
	shm_clist_rm(s);
	ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
}


@@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
	shm_file = shp->shm_file;
	shp->shm_file = NULL;
	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
	shm_rmid(ns, shp);
	shm_rmid(shp);
	shm_unlock(shp);
	if (!is_file_hugepages(shm_file))
		shmem_lock(shm_file, 0, shp->mlock_user);
@@ -306,10 +349,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 *
 * 2) sysctl kernel.shm_rmid_forced is set to 1.
 */
static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
static bool shm_may_destroy(struct shmid_kernel *shp)
{
	return (shp->shm_nattch == 0) &&
	       (ns->shm_rmid_forced ||
	       (shp->ns->shm_rmid_forced ||
		(shp->shm_perm.mode & SHM_DEST));
}

@@ -340,7 +383,7 @@ static void shm_close(struct vm_area_struct *vma)
	ipc_update_pid(&shp->shm_lprid, task_tgid(current));
	shp->shm_dtim = ktime_get_real_seconds();
	shp->shm_nattch--;
	if (shm_may_destroy(ns, shp))
	if (shm_may_destroy(shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
@@ -361,10 +404,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
	 *
	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
	 */
	if (shp->shm_creator != NULL)
	if (!list_empty(&shp->shm_clist))
		return 0;

	if (shm_may_destroy(ns, shp)) {
	if (shm_may_destroy(shp)) {
		shm_lock_by_ptr(shp);
		shm_destroy(ns, shp);
	}
@@ -382,48 +425,97 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
	struct shmid_kernel *shp, *n;
	for (;;) {
		struct shmid_kernel *shp;
		struct ipc_namespace *ns;

		task_lock(task);

		if (list_empty(&task->sysvshm.shm_clist)) {
			task_unlock(task);
			break;
		}

	if (list_empty(&task->sysvshm.shm_clist))
		return;
		shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
				shm_clist);

		/*
	 * If kernel.shm_rmid_forced is not set then only keep track of
		 * 1) Get pointer to the ipc namespace. It is worth to say
		 * that this pointer is guaranteed to be valid because
		 * shp lifetime is always shorter than namespace lifetime
		 * in which shp lives.
		 * We taken task_lock it means that shp won't be freed.
		 */
		ns = shp->ns;

		/*
		 * 2) If kernel.shm_rmid_forced is not set then only keep track of
		 * which shmids are orphaned, so that a later set of the sysctl
		 * can clean them up.
		 */
	if (!ns->shm_rmid_forced) {
		down_read(&shm_ids(ns).rwsem);
		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
			shp->shm_creator = NULL;
		if (!ns->shm_rmid_forced)
			goto unlink_continue;

		/*
		 * Only under read lock but we are only called on current
		 * so no entry on the list will be shared.
		 * 3) get a reference to the namespace.
		 *    The refcount could be already 0. If it is 0, then
		 *    the shm objects will be free by free_ipc_work().
		 */
		list_del(&task->sysvshm.shm_clist);
		up_read(&shm_ids(ns).rwsem);
		return;
		ns = get_ipc_ns_not_zero(ns);
		if (!ns) {
unlink_continue:
			list_del_init(&shp->shm_clist);
			task_unlock(task);
			continue;
		}

		/*
	 * Destroy all already created segments, that were not yet mapped,
	 * and mark any mapped as orphan to cover the sysctl toggling.
	 * Destroy is skipped if shm_may_destroy() returns false.
		 * 4) get a reference to shp.
		 *   This cannot fail: shm_clist_rm() is called before
		 *   ipc_rmid(), thus the refcount cannot be 0.
		 */
	down_write(&shm_ids(ns).rwsem);
	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
		shp->shm_creator = NULL;
		WARN_ON(!ipc_rcu_getref(&shp->shm_perm));

		if (shm_may_destroy(ns, shp)) {
		/*
		 * 5) unlink the shm segment from the list of segments
		 *    created by current.
		 *    This must be done last. After unlinking,
		 *    only the refcounts obtained above prevent IPC_RMID
		 *    from destroying the segment or the namespace.
		 */
		list_del_init(&shp->shm_clist);

		task_unlock(task);

		/*
		 * 6) we have all references
		 *    Thus lock & if needed destroy shp.
		 */
		down_write(&shm_ids(ns).rwsem);
		shm_lock_by_ptr(shp);
		/*
		 * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's
		 * safe to call ipc_rcu_putref here
		 */
		ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);

		if (ipc_valid_object(&shp->shm_perm)) {
			if (shm_may_destroy(shp))
				shm_destroy(ns, shp);
		}
			else
				shm_unlock(shp);
		} else {
			/*
			 * Someone else deleted the shp from namespace
			 * idr/kht while we have waited.
			 * Just unlock and continue.
			 */
			shm_unlock(shp);
		}

	/* Remove the list head from any segments still attached. */
	list_del(&task->sysvshm.shm_clist);
		up_write(&shm_ids(ns).rwsem);
		put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
	}
}

static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -680,7 +772,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
	if (error < 0)
		goto no_id;

	shp->ns = ns;

	task_lock(current);
	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
	task_unlock(current);

	/*
	 * shmid gets reported as "inode#" in /proc/pid/maps.
@@ -1575,7 +1671,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
	down_write(&shm_ids(ns).rwsem);
	shp = shm_lock(ns, shmid);
	shp->shm_nattch--;
	if (shm_may_destroy(ns, shp))

	if (shm_may_destroy(shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);