Loading include/linux/percpu-rwsem.h +75 −9 Original line number Diff line number Diff line Loading @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *fast_read_ctr; unsigned int __percpu *read_count; struct rw_semaphore rw_sem; atomic_t slow_read_ctr; wait_queue_head_t write_waitq; wait_queue_head_t writer; int readers_block; }; extern void percpu_down_read(struct percpu_rw_semaphore *); extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); extern void percpu_up_read(struct percpu_rw_semaphore *); extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); static inline void percpu_down_read(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); preempt_disable(); /* * We are in an RCU-sched read-side critical section, so the writer * cannot both change sem->state from readers_fast and start checking * counters while we are here. So if we see !sem->state, we know that * the writer won't be checking until we're past the preempt_enable() * and that one the synchronize_sched() is done, the writer will see * anything we did within this RCU-sched read-size critical section. */ __this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) __percpu_down_read(sem, false); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from * bleeding the critical section out. */ } static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { int ret = 1; preempt_disable(); /* * Same as in percpu_down_read(). */ __this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from * bleeding the critical section out. */ if (ret) rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); return ret; } static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { /* * The barrier() in preempt_disable() prevents the compiler from * bleeding the critical section out. */ preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) __this_cpu_dec(*sem->read_count); else __percpu_up_read(sem); /* Unconditional memory barrier */ preempt_enable(); rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); } extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_init_rwsem(brw) \ #define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ __percpu_init_rwsem(brw, #brw, &rwsem_key); \ __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, Loading include/linux/rcu_sync.h +1 −0 Original line number Diff line number Diff line Loading @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); Loading kernel/cgroup.c +6 −0 Original line number Diff line number Diff line Loading @@ -5376,6 +5376,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. */ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ Loading kernel/fork.c +2 −2 Original line number Diff line number Diff line Loading @@ -1375,7 +1375,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); Loading Loading @@ -1527,6 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed Loading Loading @@ -1627,6 +1627,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: Loading Loading @@ -1657,7 +1658,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); Loading kernel/locking/percpu-rwsem.c +132 −97 Original line number Diff line number Diff line Loading @@ -8,151 +8,186 @@ #include <linux/sched.h> #include <linux/errno.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { brw->fast_read_ctr = alloc_percpu(int); if (unlikely(!brw->fast_read_ctr)) sem->read_count = alloc_percpu(int); if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); __init_rwsem(&sem->rw_sem, name, rwsem_key); init_waitqueue_head(&sem->writer); sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ if (!brw->fast_read_ctr) if (!sem->read_count) return; rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ rcu_sync_dtor(&sem->rss); free_percpu(sem->read_count); sem->read_count = NULL; /* catch use after free bugs */ } EXPORT_SYMBOL_GPL(percpu_free_rwsem); int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { /* * This is the fast-path for down_read/up_read. If it succeeds we rely * on the barriers provided by rcu_sync_enter/exit; see the comments in * percpu_down_write() and percpu_up_write(). * Due to having preemption disabled the decrement happens on * the same CPU as the increment, avoiding the * increment-on-one-CPU-and-decrement-on-another problem. * * If the reader misses the writer's assignment of readers_block, then * the writer is guaranteed to see the reader's increment. * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. * Conversely, any readers that increment their sem->read_count after * the writer looks are guaranteed to see the readers_block value, * which in turn means that they are guaranteed to immediately * decrement their sem->read_count, so that it doesn't matter that the * writer missed them. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { bool success; preempt_disable(); success = rcu_sync_is_idle(&brw->rss); if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); preempt_enable(); smp_mb(); /* A matches D */ return success; } /* * If !readers_block the critical section starts here, matched by the * release in percpu_up_write(). */ if (likely(!smp_load_acquire(&sem->readers_block))) return 1; /* * Like the normal down_read() this is not recursive, the writer can * come after the first percpu_down_read() and create the deadlock. * * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, * percpu_up_read() does rwsem_release(). This pairs with the usage * of ->rw_sem in percpu_down/up_write(). * Per the above comment; we still have preemption disabled and * will thus decrement on the same CPU as we incremented. */ void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); __percpu_up_read(sem); if (likely(update_fast_ctr(brw, +1))) return; if (try) return 0; /* Avoid rwsem_acquire_read() and rwsem_release() */ __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); __up_read(&brw->rw_sem); } EXPORT_SYMBOL_GPL(percpu_down_read); /* * We either call schedule() in the wait, or we'll fall through * and reschedule on the preempt_enable() in percpu_down_read(). */ preempt_enable_no_resched(); int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) { if (unlikely(!update_fast_ctr(brw, +1))) { if (!__down_read_trylock(&brw->rw_sem)) return 0; atomic_inc(&brw->slow_read_ctr); __up_read(&brw->rw_sem); } /* * Avoid lockdep for the down/up_read() we already have them. */ __down_read(&sem->rw_sem); this_cpu_inc(*sem->read_count); __up_read(&sem->rw_sem); rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); preempt_disable(); return 1; } EXPORT_SYMBOL_GPL(__percpu_down_read); void percpu_up_read(struct percpu_rw_semaphore *brw) void __percpu_up_read(struct percpu_rw_semaphore *sem) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); if (likely(update_fast_ctr(brw, -1))) return; smp_mb(); /* B matches C */ /* * In other words, if they see our decrement (presumably to aggregate * zero, as that is the only time it matters) they will also see our * critical section. */ __this_cpu_dec(*sem->read_count); /* false-positive is possible but harmless */ if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); /* Prod writer to recheck readers_active */ wake_up(&sem->writer); } EXPORT_SYMBOL_GPL(percpu_up_read); EXPORT_SYMBOL_GPL(__percpu_up_read); #define per_cpu_sum(var) \ ({ \ typeof(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ __sum += per_cpu(var, cpu); \ __sum; \ }) static int clear_fast_ctr(struct percpu_rw_semaphore *brw) /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. If this sum is zero, then it is stable due to the fact that if any * newly arriving readers increment a given counter, they will immediately * decrement that same counter. */ static bool readers_active_check(struct percpu_rw_semaphore *sem) { unsigned int sum = 0; int cpu; if (per_cpu_sum(*sem->read_count) != 0) return false; for_each_possible_cpu(cpu) { sum += per_cpu(*brw->fast_read_ctr, cpu); per_cpu(*brw->fast_read_ctr, cpu) = 0; } /* * If we observed the decrement; ensure we see the entire critical * section. */ smp_mb(); /* C matches B */ return sum; return true; } void percpu_down_write(struct percpu_rw_semaphore *brw) void percpu_down_write(struct percpu_rw_semaphore *sem) { /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); down_write(&sem->rw_sem); /* * Make rcu_sync_is_idle() == F and thus disable the fast-path in * percpu_down_read() and percpu_up_read(), and wait for gp pass. * * The latter synchronises us with the preceding readers which used * the fast-past, so we can not miss the result of __this_cpu_add() * or anything else inside their criticial sections. * Notify new readers to block; up until now, and thus throughout the * longish rcu_sync_enter() above, new readers could still come in. */ rcu_sync_enter(&brw->rss); WRITE_ONCE(sem->readers_block, 1); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); smp_mb(); /* D matches A */ /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); /* * If they don't see our writer of readers_block, then we are * guaranteed to see their sem->read_count increment, and therefore * will wait for them. */ /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); /* Wait for all now active readers to complete. */ wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) void percpu_up_write(struct percpu_rw_semaphore *sem) { /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* * Enable the fast-path in percpu_down_read() and percpu_up_read() * but only after another gp pass; this adds the necessary barrier * to ensure the reader can't miss the changes done by us. * Signal the writer is done, no fast path yet. * * One reason that we cannot just immediately flip to readers_fast is * that new readers might fail to see the results of this writer's * critical section. * * Therefore we force it through the slow path which guarantees an * acquire and thereby guarantees the critical section's consistency. */ smp_store_release(&sem->readers_block, 0); /* * Release the write lock, this will allow readers back in the game. */ up_write(&sem->rw_sem); /* * Once this completes (at least one RCU-sched grace period hence) the * reader fast path will be available again. Safe to use outside the * exclusive write lock because its counting. */ rcu_sync_exit(&brw->rss); rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); Loading
include/linux/percpu-rwsem.h +75 −9 Original line number Diff line number Diff line Loading @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *fast_read_ctr; unsigned int __percpu *read_count; struct rw_semaphore rw_sem; atomic_t slow_read_ctr; wait_queue_head_t write_waitq; wait_queue_head_t writer; int readers_block; }; extern void percpu_down_read(struct percpu_rw_semaphore *); extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); extern void percpu_up_read(struct percpu_rw_semaphore *); extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); static inline void percpu_down_read(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); preempt_disable(); /* * We are in an RCU-sched read-side critical section, so the writer * cannot both change sem->state from readers_fast and start checking * counters while we are here. So if we see !sem->state, we know that * the writer won't be checking until we're past the preempt_enable() * and that one the synchronize_sched() is done, the writer will see * anything we did within this RCU-sched read-size critical section. */ __this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) __percpu_down_read(sem, false); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from * bleeding the critical section out. */ } static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { int ret = 1; preempt_disable(); /* * Same as in percpu_down_read(). */ __this_cpu_inc(*sem->read_count); if (unlikely(!rcu_sync_is_idle(&sem->rss))) ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from * bleeding the critical section out. */ if (ret) rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); return ret; } static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { /* * The barrier() in preempt_disable() prevents the compiler from * bleeding the critical section out. */ preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) __this_cpu_dec(*sem->read_count); else __percpu_up_read(sem); /* Unconditional memory barrier */ preempt_enable(); rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); } extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_init_rwsem(brw) \ #define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ __percpu_init_rwsem(brw, #brw, &rwsem_key); \ __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, Loading
include/linux/rcu_sync.h +1 −0 Original line number Diff line number Diff line Loading @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); Loading
kernel/cgroup.c +6 −0 Original line number Diff line number Diff line Loading @@ -5376,6 +5376,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. */ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ Loading
kernel/fork.c +2 −2 Original line number Diff line number Diff line Loading @@ -1375,7 +1375,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); Loading Loading @@ -1527,6 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed Loading Loading @@ -1627,6 +1627,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: Loading Loading @@ -1657,7 +1658,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); Loading
kernel/locking/percpu-rwsem.c +132 −97 Original line number Diff line number Diff line Loading @@ -8,151 +8,186 @@ #include <linux/sched.h> #include <linux/errno.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { brw->fast_read_ctr = alloc_percpu(int); if (unlikely(!brw->fast_read_ctr)) sem->read_count = alloc_percpu(int); if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); __init_rwsem(&sem->rw_sem, name, rwsem_key); init_waitqueue_head(&sem->writer); sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ if (!brw->fast_read_ctr) if (!sem->read_count) return; rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ rcu_sync_dtor(&sem->rss); free_percpu(sem->read_count); sem->read_count = NULL; /* catch use after free bugs */ } EXPORT_SYMBOL_GPL(percpu_free_rwsem); int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { /* * This is the fast-path for down_read/up_read. If it succeeds we rely * on the barriers provided by rcu_sync_enter/exit; see the comments in * percpu_down_write() and percpu_up_write(). * Due to having preemption disabled the decrement happens on * the same CPU as the increment, avoiding the * increment-on-one-CPU-and-decrement-on-another problem. * * If the reader misses the writer's assignment of readers_block, then * the writer is guaranteed to see the reader's increment. * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. * Conversely, any readers that increment their sem->read_count after * the writer looks are guaranteed to see the readers_block value, * which in turn means that they are guaranteed to immediately * decrement their sem->read_count, so that it doesn't matter that the * writer missed them. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { bool success; preempt_disable(); success = rcu_sync_is_idle(&brw->rss); if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); preempt_enable(); smp_mb(); /* A matches D */ return success; } /* * If !readers_block the critical section starts here, matched by the * release in percpu_up_write(). */ if (likely(!smp_load_acquire(&sem->readers_block))) return 1; /* * Like the normal down_read() this is not recursive, the writer can * come after the first percpu_down_read() and create the deadlock. * * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, * percpu_up_read() does rwsem_release(). This pairs with the usage * of ->rw_sem in percpu_down/up_write(). * Per the above comment; we still have preemption disabled and * will thus decrement on the same CPU as we incremented. */ void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); __percpu_up_read(sem); if (likely(update_fast_ctr(brw, +1))) return; if (try) return 0; /* Avoid rwsem_acquire_read() and rwsem_release() */ __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); __up_read(&brw->rw_sem); } EXPORT_SYMBOL_GPL(percpu_down_read); /* * We either call schedule() in the wait, or we'll fall through * and reschedule on the preempt_enable() in percpu_down_read(). */ preempt_enable_no_resched(); int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) { if (unlikely(!update_fast_ctr(brw, +1))) { if (!__down_read_trylock(&brw->rw_sem)) return 0; atomic_inc(&brw->slow_read_ctr); __up_read(&brw->rw_sem); } /* * Avoid lockdep for the down/up_read() we already have them. */ __down_read(&sem->rw_sem); this_cpu_inc(*sem->read_count); __up_read(&sem->rw_sem); rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); preempt_disable(); return 1; } EXPORT_SYMBOL_GPL(__percpu_down_read); void percpu_up_read(struct percpu_rw_semaphore *brw) void __percpu_up_read(struct percpu_rw_semaphore *sem) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); if (likely(update_fast_ctr(brw, -1))) return; smp_mb(); /* B matches C */ /* * In other words, if they see our decrement (presumably to aggregate * zero, as that is the only time it matters) they will also see our * critical section. */ __this_cpu_dec(*sem->read_count); /* false-positive is possible but harmless */ if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); /* Prod writer to recheck readers_active */ wake_up(&sem->writer); } EXPORT_SYMBOL_GPL(percpu_up_read); EXPORT_SYMBOL_GPL(__percpu_up_read); #define per_cpu_sum(var) \ ({ \ typeof(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ __sum += per_cpu(var, cpu); \ __sum; \ }) static int clear_fast_ctr(struct percpu_rw_semaphore *brw) /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. If this sum is zero, then it is stable due to the fact that if any * newly arriving readers increment a given counter, they will immediately * decrement that same counter. */ static bool readers_active_check(struct percpu_rw_semaphore *sem) { unsigned int sum = 0; int cpu; if (per_cpu_sum(*sem->read_count) != 0) return false; for_each_possible_cpu(cpu) { sum += per_cpu(*brw->fast_read_ctr, cpu); per_cpu(*brw->fast_read_ctr, cpu) = 0; } /* * If we observed the decrement; ensure we see the entire critical * section. */ smp_mb(); /* C matches B */ return sum; return true; } void percpu_down_write(struct percpu_rw_semaphore *brw) void percpu_down_write(struct percpu_rw_semaphore *sem) { /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); down_write(&sem->rw_sem); /* * Make rcu_sync_is_idle() == F and thus disable the fast-path in * percpu_down_read() and percpu_up_read(), and wait for gp pass. * * The latter synchronises us with the preceding readers which used * the fast-past, so we can not miss the result of __this_cpu_add() * or anything else inside their criticial sections. * Notify new readers to block; up until now, and thus throughout the * longish rcu_sync_enter() above, new readers could still come in. */ rcu_sync_enter(&brw->rss); WRITE_ONCE(sem->readers_block, 1); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); smp_mb(); /* D matches A */ /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); /* * If they don't see our writer of readers_block, then we are * guaranteed to see their sem->read_count increment, and therefore * will wait for them. */ /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); /* Wait for all now active readers to complete. */ wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) void percpu_up_write(struct percpu_rw_semaphore *sem) { /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* * Enable the fast-path in percpu_down_read() and percpu_up_read() * but only after another gp pass; this adds the necessary barrier * to ensure the reader can't miss the changes done by us. * Signal the writer is done, no fast path yet. * * One reason that we cannot just immediately flip to readers_fast is * that new readers might fail to see the results of this writer's * critical section. * * Therefore we force it through the slow path which guarantees an * acquire and thereby guarantees the critical section's consistency. */ smp_store_release(&sem->readers_block, 0); /* * Release the write lock, this will allow readers back in the game. */ up_write(&sem->rw_sem); /* * Once this completes (at least one RCU-sched grace period hence) the * reader fast path will be available again. Safe to use outside the * exclusive write lock because its counting. */ rcu_sync_exit(&brw->rss); rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write);