Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b6daa51b authored by Linus Torvalds
Browse files
Pull percpu updates from Tejun Heo:

 - Nick improved generic implementations of percpu operations which
   modify the variable and return a value, so that they calculate the
   physical address only once.

 - percpu_ref percpu <-> atomic mode switching improvements. The
   patchset was originally posted about a year ago but fell through the
   cracks.

 - misc non-critical fixes.

* 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu:
  mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk()
  mm/percpu.c: correct max_distance calculation for pcpu_embed_first_chunk()
  percpu: eliminate two sparse warnings
  percpu: improve generic percpu modify-return implementation
  percpu-refcount: init ->confirm_switch member properly
  percpu_ref: allow operation mode switching operations to be called concurrently
  percpu_ref: restructure operation mode switching
  percpu_ref: unify staggered atomic switching wait behavior
  percpu_ref: reorganize __percpu_ref_switch_to_atomic() and relocate percpu_ref_switch_to_atomic()
  percpu_ref: remove unnecessary RCU grace period for staggered atomic switching confirmation
parents f96ed261 9b739662
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -521,7 +521,8 @@ do { \
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
                        const unsigned long __percpu *addr)
{
	unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
	unsigned long __percpu *a =
		(unsigned long __percpu *)addr + nr / BITS_PER_LONG;

#ifdef CONFIG_X86_64
	return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
@@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
	asm volatile("bt "__percpu_arg(2)",%1\n\t"
			CC_SET(c)
			: CC_OUT(c) (oldbit)
			: "m" (*(unsigned long *)addr), "Ir" (nr));
			: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));

	return oldbit;
}
+30 −23
Original line number Diff line number Diff line
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
#define PER_CPU_DEF_ATTRIBUTES
#endif

/*
 * Generic fallback read of a per-cpu variable.
 *
 * Evaluates to the value stored at the address that raw_cpu_ptr(&(pcp))
 * yields.  The macro itself adds no preemption or interrupt protection;
 * it is a bare dereference wrapped in a statement expression so it can be
 * used wherever a value is expected.
 */
#define raw_cpu_generic_read(pcp)					\
({									\
	*raw_cpu_ptr(&(pcp));						\
})

#define raw_cpu_generic_to_op(pcp, val, op)				\
do {									\
	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do { \

#define raw_cpu_generic_add_return(pcp, val)				\
({									\
	raw_cpu_add(pcp, val);						\
	raw_cpu_read(pcp);						\
	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
									\
	*__p += val;							\
	*__p;								\
})

#define raw_cpu_generic_xchg(pcp, nval)					\
({									\
	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
	typeof(pcp) __ret;						\
	__ret = raw_cpu_read(pcp);					\
	raw_cpu_write(pcp, nval);					\
	__ret = *__p;							\
	*__p = nval;							\
	__ret;								\
})

#define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
({									\
	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
	typeof(pcp) __ret;						\
	__ret = raw_cpu_read(pcp);					\
	__ret = *__p;							\
	if (__ret == (oval))						\
		raw_cpu_write(pcp, nval);				\
		*__p = nval;						\
	__ret;								\
})

#define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
({									\
	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
	int __ret = 0;							\
	if (raw_cpu_read(pcp1) == (oval1) &&				\
			 raw_cpu_read(pcp2)  == (oval2)) {		\
		raw_cpu_write(pcp1, nval1);				\
		raw_cpu_write(pcp2, nval2);				\
	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
		*__p1 = nval1;						\
		*__p2 = nval2;						\
		__ret = 1;						\
	}								\
	(__ret);							\
@@ -109,7 +119,7 @@ do { \
({									\
	typeof(pcp) __ret;						\
	preempt_disable();						\
	__ret = *this_cpu_ptr(&(pcp));					\
	__ret = raw_cpu_generic_read(pcp);				\
	preempt_enable();						\
	__ret;								\
})
@@ -118,17 +128,17 @@ do { \
do {									\
	unsigned long __flags;						\
	raw_local_irq_save(__flags);					\
	*raw_cpu_ptr(&(pcp)) op val;					\
	raw_cpu_generic_to_op(pcp, val, op);				\
	raw_local_irq_restore(__flags);					\
} while (0)


#define this_cpu_generic_add_return(pcp, val)				\
({									\
	typeof(pcp) __ret;						\
	unsigned long __flags;						\
	raw_local_irq_save(__flags);					\
	raw_cpu_add(pcp, val);						\
	__ret = raw_cpu_read(pcp);					\
	__ret = raw_cpu_generic_add_return(pcp, val);			\
	raw_local_irq_restore(__flags);					\
	__ret;								\
})
@@ -138,8 +148,7 @@ do { \
	typeof(pcp) __ret;						\
	unsigned long __flags;						\
	raw_local_irq_save(__flags);					\
	__ret = raw_cpu_read(pcp);					\
	raw_cpu_write(pcp, nval);					\
	__ret = raw_cpu_generic_xchg(pcp, nval);			\
	raw_local_irq_restore(__flags);					\
	__ret;								\
})
@@ -149,9 +158,7 @@ do { \
	typeof(pcp) __ret;						\
	unsigned long __flags;						\
	raw_local_irq_save(__flags);					\
	__ret = raw_cpu_read(pcp);					\
	if (__ret == (oval))						\
		raw_cpu_write(pcp, nval);				\
	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
	raw_local_irq_restore(__flags);					\
	__ret;								\
})
@@ -168,16 +175,16 @@ do { \
})

#ifndef raw_cpu_read_1
#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_2
#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_4
#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_8
#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
#endif

#ifndef raw_cpu_write_1
+95 −74
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@

#define PERCPU_COUNT_BIAS	(1LU << (BITS_PER_LONG - 1))

static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);

static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
@@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
	atomic_long_set(&ref->count, start_count);

	ref->release = release;
	ref->confirm_switch = NULL;
	return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);
@@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref)
	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);

	if (percpu_count) {
		/* non-NULL confirm_switch indicates switching in progress */
		WARN_ON_ONCE(ref->confirm_switch);
		free_percpu(percpu_count);
		ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
	}
@@ -161,34 +165,67 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
					  percpu_ref_func_t *confirm_switch)
{
	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
	if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
		if (confirm_switch)
			confirm_switch(ref);
		return;
	}

	/* switching from percpu to atomic */
	ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;

	/*
		 * Non-NULL ->confirm_switch is used to indicate that
		 * switching is in progress.  Use noop one if unspecified.
	 * Non-NULL ->confirm_switch is used to indicate that switching is
	 * in progress.  Use noop one if unspecified.
	 */
		WARN_ON_ONCE(ref->confirm_switch);
		ref->confirm_switch =
			confirm_switch ?: percpu_ref_noop_confirm_switch;
	ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch;

	percpu_ref_get(ref);	/* put after confirmation */
	call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
	} else if (confirm_switch) {
}

static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
	int cpu;

	BUG_ON(!percpu_count);

	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
		return;

	atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);

	/*
		 * Somebody already set ATOMIC.  Switching may still be in
		 * progress.  @confirm_switch must be invoked after the
		 * switching is complete and a full sched RCU grace period
		 * has passed.  Wait synchronously for the previous
		 * switching and schedule @confirm_switch invocation.
	 * Restore per-cpu operation.  smp_store_release() is paired with
	 * smp_read_barrier_depends() in __ref_is_percpu() and guarantees
	 * that the zeroing is visible to all percpu accesses which can see
	 * the following __PERCPU_REF_ATOMIC clearing.
	 */
		wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
		ref->confirm_switch = confirm_switch;
	for_each_possible_cpu(cpu)
		*per_cpu_ptr(percpu_count, cpu) = 0;

		percpu_ref_get(ref);	/* put after confirmation */
		call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
	smp_store_release(&ref->percpu_count_ptr,
			  ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}

/*
 * Bring @ref into the operating mode its current state calls for.
 *
 * @ref:		percpu_ref whose mode is to be (re)evaluated
 * @confirm_switch:	optional callback forwarded to the atomic-switch path;
 *			it is not used when the ref is switched to percpu mode
 *
 * Must be called with percpu_ref_switch_lock held (checked via lockdep).
 * After any in-flight switch has completed, the ref is switched to atomic
 * mode if ->force_atomic is set or the __PERCPU_REF_DEAD bit is present in
 * ->percpu_count_ptr, and to percpu mode otherwise.
 */
static void __percpu_ref_switch_mode(struct percpu_ref *ref,
				     percpu_ref_func_t *confirm_switch)
{
	lockdep_assert_held(&percpu_ref_switch_lock);

	/*
	 * If the previous ATOMIC switching hasn't finished yet, wait for
	 * its completion.  If the caller ensures that ATOMIC switching
	 * isn't in progress, this function can be called from any context.
	 *
	 * A non-NULL ->confirm_switch is the "switch in progress" marker
	 * being waited on here.  NOTE(review): wait_event_lock_irq() is
	 * presumably dropping percpu_ref_switch_lock while it sleeps and
	 * retaking it before returning -- confirm against its definition.
	 */
	wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch,
			    percpu_ref_switch_lock);

	/* forced-atomic or dead refs run in atomic mode, all others percpu */
	if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD))
		__percpu_ref_switch_to_atomic(ref, confirm_switch);
	else
		__percpu_ref_switch_to_percpu(ref);
}

/**
@@ -207,47 +244,21 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 * operations.  Note that @ref will stay in atomic mode across kill/reinit
 * cycles until percpu_ref_switch_to_percpu() is called.
 *
 * This function normally doesn't block and can be called from any context
 * but it may block if @confirm_kill is specified and @ref is already in
 * the process of switching to atomic mode.  In such cases, @confirm_switch
 * will be invoked after the switching is complete.
 *
 * Due to the way percpu_ref is implemented, @confirm_switch will be called
 * after at least one full sched RCU grace period has passed but this is an
 * implementation detail and must not be depended upon.
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
				 percpu_ref_func_t *confirm_switch)
{
	ref->force_atomic = true;
	__percpu_ref_switch_to_atomic(ref, confirm_switch);
}

static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
	int cpu;

	BUG_ON(!percpu_count);

	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
		return;

	wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
	unsigned long flags;

	atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	/*
	 * Restore per-cpu operation.  smp_store_release() is paired with
	 * smp_read_barrier_depends() in __ref_is_percpu() and guarantees
	 * that the zeroing is visible to all percpu accesses which can see
	 * the following __PERCPU_REF_ATOMIC clearing.
	 */
	for_each_possible_cpu(cpu)
		*per_cpu_ptr(percpu_count, cpu) = 0;
	ref->force_atomic = true;
	__percpu_ref_switch_mode(ref, confirm_switch);

	smp_store_release(&ref->percpu_count_ptr,
			  ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}

/**
@@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 * dying or dead, the actual switching takes place on the following
 * percpu_ref_reinit().
 *
 * This function normally doesn't block and can be called from any context
 * but it may block if @ref is in the process of switching to atomic mode
 * by percpu_ref_switch_atomic().
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
	unsigned long flags;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	ref->force_atomic = false;
	__percpu_ref_switch_mode(ref, NULL);

	/* a dying or dead ref can't be switched to percpu mode w/o reinit */
	if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
		__percpu_ref_switch_to_percpu(ref);
	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}

/**
@@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 *
 * This function normally doesn't block and can be called from any context
 * but it may block if @confirm_kill is specified and @ref is in the
 * process of switching to atomic mode by percpu_ref_switch_atomic().
 *
 * Due to the way percpu_ref is implemented, @confirm_switch will be called
 * after at least one full sched RCU grace period has passed but this is an
 * implementation detail and must not be depended upon.
 * process of switching to atomic mode by percpu_ref_switch_to_atomic().
 */
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
				 percpu_ref_func_t *confirm_kill)
{
	unsigned long flags;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
		  "%s called more than once on %pf!", __func__, ref->release);

	ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
	__percpu_ref_switch_to_atomic(ref, confirm_kill);
	__percpu_ref_switch_mode(ref, confirm_kill);
	percpu_ref_put(ref);

	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);

@@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
 */
void percpu_ref_reinit(struct percpu_ref *ref)
{
	unsigned long flags;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	WARN_ON_ONCE(!percpu_ref_is_zero(ref));

	ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
	percpu_ref_get(ref);
	if (!ref->force_atomic)
		__percpu_ref_switch_to_percpu(ref);
	__percpu_ref_switch_mode(ref, NULL);

	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);
+20 −18
Original line number Diff line number Diff line
@@ -1961,8 +1961,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size, max_distance;
	int group, i, rc;
	size_t size_sum, areas_size;
	unsigned long max_distance;
	int group, i, highest_group, rc;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
@@ -1978,7 +1979,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
		goto out_free;
	}

	/* allocate, copy and determine base address */
	/* allocate, copy and determine base address & max_distance */
	highest_group = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
@@ -1999,6 +2001,21 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
		areas[group] = ptr;

		base = min(ptr, base);
		if (ptr > areas[highest_group])
			highest_group = group;
	}
	max_distance = areas[highest_group] - base;
	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
				max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free_areas;
#endif
	}

	/*
@@ -2023,23 +2040,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
	}

	/* base address is now known, determine group base offsets */
	max_distance = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		ai->groups[group].base_offset = areas[group] - base;
		max_distance = max_t(size_t, max_distance,
				     ai->groups[group].base_offset);
	}
	max_distance += ai->unit_size;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
			max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free;
#endif
	}

	pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",