Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 942e4a2b authored by Stephen Hemminger, committed by David S. Miller
Browse files

netfilter: revised locking for x_tables



The x_tables are organized with a table structure and per-cpu copies
of the counters and rules. On older kernels there was a reader/writer
lock per table, which was a performance bottleneck. In 2.6.30-rc, this
was converted to use RCU for the counters/rules, which solved the performance
problems for do_table but made replacing rules much slower because of
the necessary RCU grace period.

This version uses a per-cpu set of spinlocks and counters to allow
table processing to proceed without the cache thrashing of a global
reader lock, and keeps the same performance for table updates.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent bf368e4e
Loading
Loading
Loading
Loading
+68 −5
Original line number Original line Diff line number Diff line
@@ -354,9 +354,6 @@ struct xt_table
	/* What hooks you will enter on */
	/* What hooks you will enter on */
	unsigned int valid_hooks;
	unsigned int valid_hooks;


	/* Lock for the curtain */
	struct mutex lock;

	/* Man behind the curtain... */
	/* Man behind the curtain... */
	struct xt_table_info *private;
	struct xt_table_info *private;


@@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);


extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
extern void xt_free_table_info(struct xt_table_info *info);
extern void xt_free_table_info(struct xt_table_info *info);
extern void xt_table_entry_swap_rcu(struct xt_table_info *old,

				    struct xt_table_info *new);
/*
 * Per-CPU spinlock associated with per-cpu table entries, and
 * with a counter for the "reading" side that allows a recursive
 * reader to avoid taking the lock and deadlocking.
 *
 * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
 * It needs to ensure that the rules are not being changed while the packet
 * is being processed. In some cases, the read lock will be acquired
 * twice on the same CPU; this is okay because of the count.
 *
 * "writing" is used when reading counters.
 *  During replace any readers that are using the old tables have to complete
 *  before freeing the old table. This is handled by the write locking
 *  necessary for reading the counters.
 */
struct xt_info_lock {
	spinlock_t lock;	/* taken by the outermost reader on this CPU, and by writers */
	unsigned char readers;	/* read-side nesting depth on this CPU; readers hold the lock while it is non-zero */
};
/*
 * One xt_info_lock per CPU.  The matching DEFINE_PER_CPU sits next to
 * xt_replace_table(), and every instance is initialised in xt_init().
 */
DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);

/*
 * Enter the read side of this CPU's xt_info_lock.
 *
 * Bottom halves are switched off first and only then is the per-cpu
 * variable looked up, which is why this is open-coded as two steps
 * rather than a single "spin_lock_bh()": preemption has to be disabled
 * before the per-cpu lock is picked.
 *
 * Disabling bottom half processing before touching the nesting count
 * also guarantees that the only kind of re-entrancy on this CPU is this
 * code being called by itself.  Bumping the count and taking the lock
 * are not one atomic operation, so no races can be allowed; the count is
 * safe _only_ because it is per-cpu and never re-entered asynchronously.
 *
 * The spinlock is taken by the outermost caller alone (count going from
 * zero to one); nested callers merely bump the count.
 */
static inline void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *cpu_lock;

	local_bh_disable();
	cpu_lock = &__get_cpu_var(xt_info_locks);
	if (cpu_lock->readers++ == 0)
		spin_lock(&cpu_lock->lock);
}

/*
 * Leave the read side entered through xt_info_rdlock_bh().
 *
 * Drops one nesting level on this CPU's lock, releases the spinlock when
 * the outermost level exits (count reaching zero), and finally undoes the
 * local_bh_disable() done on entry.
 */
static inline void xt_info_rdunlock_bh(void)
{
	struct xt_info_lock *cpu_lock = &__get_cpu_var(xt_info_locks);

	if (--cpu_lock->readers == 0)
		spin_unlock(&cpu_lock->lock);
	local_bh_enable();
}

/*
 * The "writer" side needs to get exclusive access to the lock,
 * regardless of readers.  This must be called with bottom half
 * processing (and thus also preemption) disabled.
 */
static inline void xt_info_wrlock(unsigned int cpu)
{
	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
}

static inline void xt_info_wrunlock(unsigned int cpu)
{
	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
}


/*
/*
 * This helper is performance critical and must be inlined
 * This helper is performance critical and must be inlined
+36 −89
Original line number Original line Diff line number Diff line
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
	indev = in ? in->name : nulldevname;
	indev = in ? in->name : nulldevname;
	outdev = out ? out->name : nulldevname;
	outdev = out ? out->name : nulldevname;


	rcu_read_lock_bh();
	xt_info_rdlock_bh();
	private = rcu_dereference(table->private);
	private = table->private;
	table_base = rcu_dereference(private->entries[smp_processor_id()]);
	table_base = private->entries[smp_processor_id()];


	e = get_entry(table_base, private->hook_entry[hook]);
	e = get_entry(table_base, private->hook_entry[hook]);
	back = get_entry(table_base, private->underflow[hook]);
	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,


			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
				(2 * skb->dev->addr_len);
				(2 * skb->dev->addr_len);

			ADD_COUNTER(e->counters, hdr_len, 1);
			ADD_COUNTER(e->counters, hdr_len, 1);


			t = arpt_get_target(e);
			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
			e = (void *)e + e->next_offset;
			e = (void *)e + e->next_offset;
		}
		}
	} while (!hotdrop);
	} while (!hotdrop);

	xt_info_rdunlock_bh();
	rcu_read_unlock_bh();


	if (hotdrop)
	if (hotdrop)
		return NF_DROP;
		return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
	/* Instead of clearing (by a previous call to memset())
	/* Instead of clearing (by a previous call to memset())
	 * the counters and using adds, we set the counters
	 * the counters and using adds, we set the counters
	 * with data used by 'current' CPU
	 * with data used by 'current' CPU
	 * We dont care about preemption here.
	 *
	 * Bottom half has to be disabled to prevent deadlock
	 * if new softirq were to run and call ipt_do_table
	 */
	 */
	curcpu = raw_smp_processor_id();
	local_bh_disable();
	curcpu = smp_processor_id();


	i = 0;
	i = 0;
	ARPT_ENTRY_ITERATE(t->entries[curcpu],
	ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
		if (cpu == curcpu)
		if (cpu == curcpu)
			continue;
			continue;
		i = 0;
		i = 0;
		xt_info_wrlock(cpu);
		ARPT_ENTRY_ITERATE(t->entries[cpu],
		ARPT_ENTRY_ITERATE(t->entries[cpu],
				   t->size,
				   t->size,
				   add_entry_to_counter,
				   add_entry_to_counter,
				   counters,
				   counters,
				   &i);
				   &i);
		xt_info_wrunlock(cpu);
	}
	}
}


/* We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK. */
static int
add_counter_to_entry(struct arpt_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

	(*i)++;
	return 0;
}

/* Take values from counters and add them back onto the current cpu */
static void put_counters(struct xt_table_info *t,
			 const struct xt_counters counters[])
{
	unsigned int i, cpu;

	local_bh_disable();
	cpu = smp_processor_id();
	i = 0;
	ARPT_ENTRY_ITERATE(t->entries[cpu],
			  t->size,
			  add_counter_to_entry,
			  counters,
			  &i);
	local_bh_enable();
	local_bh_enable();
}
}


static inline int
zero_entry_counter(struct arpt_entry *e, void *arg)
{
	e->counters.bcnt = 0;
	e->counters.pcnt = 0;
	return 0;
}

static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
	unsigned int cpu;
	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];

	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
	for_each_possible_cpu(cpu) {
		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
				  zero_entry_counter, NULL);
	}
}

static struct xt_counters *alloc_counters(struct xt_table *table)
static struct xt_counters *alloc_counters(struct xt_table *table)
{
{
	unsigned int countersize;
	unsigned int countersize;
	struct xt_counters *counters;
	struct xt_counters *counters;
	struct xt_table_info *private = table->private;
	struct xt_table_info *private = table->private;
	struct xt_table_info *info;


	/* We need atomic snapshot of counters: rest doesn't change
	/* We need atomic snapshot of counters: rest doesn't change
	 * (other than comefrom, which userspace doesn't care
	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
	counters = vmalloc_node(countersize, numa_node_id());
	counters = vmalloc_node(countersize, numa_node_id());


	if (counters == NULL)
	if (counters == NULL)
		goto nomem;
		return ERR_PTR(-ENOMEM);

	info = xt_alloc_table_info(private->size);
	if (!info)
		goto free_counters;

	clone_counters(info, private);

	mutex_lock(&table->lock);
	xt_table_entry_swap_rcu(private, info);
	synchronize_net();	/* Wait until smoke has cleared */

	get_counters(info, counters);
	put_counters(private, counters);
	mutex_unlock(&table->lock);


	xt_free_table_info(info);
	get_counters(private, counters);


	return counters;
	return counters;

 free_counters:
	vfree(counters);
 nomem:
	return ERR_PTR(-ENOMEM);
}
}


static int copy_entries_to_user(unsigned int total_size,
static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
	    (newinfo->number <= oldinfo->initial_entries))
	    (newinfo->number <= oldinfo->initial_entries))
		module_put(t->me);
		module_put(t->me);


	/* Get the old counters. */
	/* Get the old counters, and synchronize with replace */
	get_counters(oldinfo, counters);
	get_counters(oldinfo, counters);

	/* Decrease module usage counts and free resource */
	/* Decrease module usage counts and free resource */
	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
	return ret;
	return ret;
}
}


/* We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK. */
static int
add_counter_to_entry(struct arpt_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

	(*i)++;
	return 0;
}

static int do_add_counters(struct net *net, void __user *user, unsigned int len,
static int do_add_counters(struct net *net, void __user *user, unsigned int len,
			   int compat)
			   int compat)
{
{
	unsigned int i;
	unsigned int i, curcpu;
	struct xt_counters_info tmp;
	struct xt_counters_info tmp;
	struct xt_counters *paddc;
	struct xt_counters *paddc;
	unsigned int num_counters;
	unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
		goto free;
		goto free;
	}
	}


	mutex_lock(&t->lock);
	local_bh_disable();
	private = t->private;
	private = t->private;
	if (private->number != num_counters) {
	if (private->number != num_counters) {
		ret = -EINVAL;
		ret = -EINVAL;
		goto unlock_up_free;
		goto unlock_up_free;
	}
	}


	preempt_disable();
	i = 0;
	i = 0;
	/* Choose the copy that is on our node */
	/* Choose the copy that is on our node */
	loc_cpu_entry = private->entries[smp_processor_id()];
	curcpu = smp_processor_id();
	loc_cpu_entry = private->entries[curcpu];
	xt_info_wrlock(curcpu);
	ARPT_ENTRY_ITERATE(loc_cpu_entry,
	ARPT_ENTRY_ITERATE(loc_cpu_entry,
			   private->size,
			   private->size,
			   add_counter_to_entry,
			   add_counter_to_entry,
			   paddc,
			   paddc,
			   &i);
			   &i);
	preempt_enable();
	xt_info_wrunlock(curcpu);
 unlock_up_free:
 unlock_up_free:
	mutex_unlock(&t->lock);
	local_bh_enable();

	xt_table_unlock(t);
	xt_table_unlock(t);
	module_put(t->me);
	module_put(t->me);
 free:
 free:
+35 −91
Original line number Original line Diff line number Diff line
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
	tgpar.hooknum = hook;
	tgpar.hooknum = hook;


	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
	IP_NF_ASSERT(table->valid_hooks & (1 << hook));

	xt_info_rdlock_bh();
	rcu_read_lock_bh();
	private = table->private;
	private = rcu_dereference(table->private);
	table_base = private->entries[smp_processor_id()];
	table_base = rcu_dereference(private->entries[smp_processor_id()]);


	e = get_entry(table_base, private->hook_entry[hook]);
	e = get_entry(table_base, private->hook_entry[hook]);


@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
			e = (void *)e + e->next_offset;
			e = (void *)e + e->next_offset;
		}
		}
	} while (!hotdrop);
	} while (!hotdrop);

	xt_info_rdunlock_bh();
	rcu_read_unlock_bh();


#ifdef DEBUG_ALLOW_ALL
#ifdef DEBUG_ALLOW_ALL
	return NF_ACCEPT;
	return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,


	/* Instead of clearing (by a previous call to memset())
	/* Instead of clearing (by a previous call to memset())
	 * the counters and using adds, we set the counters
	 * the counters and using adds, we set the counters
	 * with data used by 'current' CPU
	 * with data used by 'current' CPU.
	 * We dont care about preemption here.
	 *
	 * Bottom half has to be disabled to prevent deadlock
	 * if new softirq were to run and call ipt_do_table
	 */
	 */
	curcpu = raw_smp_processor_id();
	local_bh_disable();
	curcpu = smp_processor_id();


	i = 0;
	i = 0;
	IPT_ENTRY_ITERATE(t->entries[curcpu],
	IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
		if (cpu == curcpu)
		if (cpu == curcpu)
			continue;
			continue;
		i = 0;
		i = 0;
		xt_info_wrlock(cpu);
		IPT_ENTRY_ITERATE(t->entries[cpu],
		IPT_ENTRY_ITERATE(t->entries[cpu],
				  t->size,
				  t->size,
				  add_entry_to_counter,
				  add_entry_to_counter,
				  counters,
				  counters,
				  &i);
				  &i);
		xt_info_wrunlock(cpu);
	}
	}

}

/* We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK. */
static int
add_counter_to_entry(struct ipt_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

	(*i)++;
	return 0;
}

/* Take values from counters and add them back onto the current cpu */
static void put_counters(struct xt_table_info *t,
			 const struct xt_counters counters[])
{
	unsigned int i, cpu;

	local_bh_disable();
	cpu = smp_processor_id();
	i = 0;
	IPT_ENTRY_ITERATE(t->entries[cpu],
			  t->size,
			  add_counter_to_entry,
			  counters,
			  &i);
	local_bh_enable();
	local_bh_enable();
}
}



static inline int
zero_entry_counter(struct ipt_entry *e, void *arg)
{
	e->counters.bcnt = 0;
	e->counters.pcnt = 0;
	return 0;
}

static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
	unsigned int cpu;
	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];

	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
	for_each_possible_cpu(cpu) {
		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
				  zero_entry_counter, NULL);
	}
}

static struct xt_counters * alloc_counters(struct xt_table *table)
static struct xt_counters * alloc_counters(struct xt_table *table)
{
{
	unsigned int countersize;
	unsigned int countersize;
	struct xt_counters *counters;
	struct xt_counters *counters;
	struct xt_table_info *private = table->private;
	struct xt_table_info *private = table->private;
	struct xt_table_info *info;


	/* We need atomic snapshot of counters: rest doesn't change
	/* We need atomic snapshot of counters: rest doesn't change
	   (other than comefrom, which userspace doesn't care
	   (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
	counters = vmalloc_node(countersize, numa_node_id());
	counters = vmalloc_node(countersize, numa_node_id());


	if (counters == NULL)
	if (counters == NULL)
		goto nomem;
		return ERR_PTR(-ENOMEM);

	info = xt_alloc_table_info(private->size);
	if (!info)
		goto free_counters;

	clone_counters(info, private);

	mutex_lock(&table->lock);
	xt_table_entry_swap_rcu(private, info);
	synchronize_net();	/* Wait until smoke has cleared */


	get_counters(info, counters);
	get_counters(private, counters);
	put_counters(private, counters);
	mutex_unlock(&table->lock);

	xt_free_table_info(info);


	return counters;
	return counters;

 free_counters:
	vfree(counters);
 nomem:
	return ERR_PTR(-ENOMEM);
}
}


static int
static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
	    (newinfo->number <= oldinfo->initial_entries))
	    (newinfo->number <= oldinfo->initial_entries))
		module_put(t->me);
		module_put(t->me);


	/* Get the old counters. */
	/* Get the old counters, and synchronize with replace */
	get_counters(oldinfo, counters);
	get_counters(oldinfo, counters);

	/* Decrease module usage counts and free resource */
	/* Decrease module usage counts and free resource */
	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
	return ret;
	return ret;
}
}


/* We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK. */
static int
add_counter_to_entry(struct ipt_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

	(*i)++;
	return 0;
}


static int
static int
do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
{
{
	unsigned int i;
	unsigned int i, curcpu;
	struct xt_counters_info tmp;
	struct xt_counters_info tmp;
	struct xt_counters *paddc;
	struct xt_counters *paddc;
	unsigned int num_counters;
	unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
		goto free;
		goto free;
	}
	}


	mutex_lock(&t->lock);
	local_bh_disable();
	private = t->private;
	private = t->private;
	if (private->number != num_counters) {
	if (private->number != num_counters) {
		ret = -EINVAL;
		ret = -EINVAL;
		goto unlock_up_free;
		goto unlock_up_free;
	}
	}


	preempt_disable();
	i = 0;
	i = 0;
	/* Choose the copy that is on our node */
	/* Choose the copy that is on our node */
	loc_cpu_entry = private->entries[raw_smp_processor_id()];
	curcpu = smp_processor_id();
	loc_cpu_entry = private->entries[curcpu];
	xt_info_wrlock(curcpu);
	IPT_ENTRY_ITERATE(loc_cpu_entry,
	IPT_ENTRY_ITERATE(loc_cpu_entry,
			  private->size,
			  private->size,
			  add_counter_to_entry,
			  add_counter_to_entry,
			  paddc,
			  paddc,
			  &i);
			  &i);
	preempt_enable();
	xt_info_wrunlock(curcpu);
 unlock_up_free:
 unlock_up_free:
	mutex_unlock(&t->lock);
	local_bh_enable();
	xt_table_unlock(t);
	xt_table_unlock(t);
	module_put(t->me);
	module_put(t->me);
 free:
 free:
+37 −86
Original line number Original line Diff line number Diff line
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,


	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
	IP_NF_ASSERT(table->valid_hooks & (1 << hook));


	rcu_read_lock_bh();
	xt_info_rdlock_bh();
	private = rcu_dereference(table->private);
	private = table->private;
	table_base = rcu_dereference(private->entries[smp_processor_id()]);
	table_base = private->entries[smp_processor_id()];


	e = get_entry(table_base, private->hook_entry[hook]);
	e = get_entry(table_base, private->hook_entry[hook]);


@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
#ifdef CONFIG_NETFILTER_DEBUG
#ifdef CONFIG_NETFILTER_DEBUG
	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
#endif
#endif
	rcu_read_unlock_bh();
	xt_info_rdunlock_bh();


#ifdef DEBUG_ALLOW_ALL
#ifdef DEBUG_ALLOW_ALL
	return NF_ACCEPT;
	return NF_ACCEPT;
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t,
	/* Instead of clearing (by a previous call to memset())
	/* Instead of clearing (by a previous call to memset())
	 * the counters and using adds, we set the counters
	 * the counters and using adds, we set the counters
	 * with data used by 'current' CPU
	 * with data used by 'current' CPU
	 * We dont care about preemption here.
	 *
	 * Bottom half has to be disabled to prevent deadlock
	 * if new softirq were to run and call ipt_do_table
	 */
	 */
	curcpu = raw_smp_processor_id();
	local_bh_disable();
	curcpu = smp_processor_id();


	i = 0;
	i = 0;
	IP6T_ENTRY_ITERATE(t->entries[curcpu],
	IP6T_ENTRY_ITERATE(t->entries[curcpu],
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t,
		if (cpu == curcpu)
		if (cpu == curcpu)
			continue;
			continue;
		i = 0;
		i = 0;
		xt_info_wrlock(cpu);
		IP6T_ENTRY_ITERATE(t->entries[cpu],
		IP6T_ENTRY_ITERATE(t->entries[cpu],
				  t->size,
				  t->size,
				  add_entry_to_counter,
				  add_entry_to_counter,
				  counters,
				  counters,
				  &i);
				  &i);
		xt_info_wrunlock(cpu);
	}
	}
}

/* We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK. */
static int
add_counter_to_entry(struct ip6t_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

	(*i)++;
	return 0;
}

/* Take values from counters and add them back onto the current cpu */
static void put_counters(struct xt_table_info *t,
			 const struct xt_counters counters[])
{
	unsigned int i, cpu;

	local_bh_disable();
	cpu = smp_processor_id();
	i = 0;
	IP6T_ENTRY_ITERATE(t->entries[cpu],
			   t->size,
			   add_counter_to_entry,
			   counters,
			   &i);
	local_bh_enable();
	local_bh_enable();
}
}


static inline int
zero_entry_counter(struct ip6t_entry *e, void *arg)
{
	e->counters.bcnt = 0;
	e->counters.pcnt = 0;
	return 0;
}

static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
	unsigned int cpu;
	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];

	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
	for_each_possible_cpu(cpu) {
		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
				   zero_entry_counter, NULL);
	}
}

static struct xt_counters *alloc_counters(struct xt_table *table)
static struct xt_counters *alloc_counters(struct xt_table *table)
{
{
	unsigned int countersize;
	unsigned int countersize;
	struct xt_counters *counters;
	struct xt_counters *counters;
	struct xt_table_info *private = table->private;
	struct xt_table_info *private = table->private;
	struct xt_table_info *info;


	/* We need atomic snapshot of counters: rest doesn't change
	/* We need atomic snapshot of counters: rest doesn't change
	   (other than comefrom, which userspace doesn't care
	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
	counters = vmalloc_node(countersize, numa_node_id());
	counters = vmalloc_node(countersize, numa_node_id());


	if (counters == NULL)
	if (counters == NULL)
		goto nomem;
		return ERR_PTR(-ENOMEM);

	info = xt_alloc_table_info(private->size);
	if (!info)
		goto free_counters;

	clone_counters(info, private);

	mutex_lock(&table->lock);
	xt_table_entry_swap_rcu(private, info);
	synchronize_net();	/* Wait until smoke has cleared */

	get_counters(info, counters);
	put_counters(private, counters);
	mutex_unlock(&table->lock);


	xt_free_table_info(info);
	get_counters(private, counters);


	return counters;
	return counters;

 free_counters:
	vfree(counters);
 nomem:
	return ERR_PTR(-ENOMEM);
}
}


static int
static int
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
	    (newinfo->number <= oldinfo->initial_entries))
	    (newinfo->number <= oldinfo->initial_entries))
		module_put(t->me);
		module_put(t->me);


	/* Get the old counters. */
	/* Get the old counters, and synchronize with replace */
	get_counters(oldinfo, counters);
	get_counters(oldinfo, counters);

	/* Decrease module usage counts and free resource */
	/* Decrease module usage counts and free resource */
	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len)
	return ret;
	return ret;
}
}


/* We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK. */
static int
add_counter_to_entry(struct ip6t_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

	(*i)++;
	return 0;
}

static int
static int
do_add_counters(struct net *net, void __user *user, unsigned int len,
do_add_counters(struct net *net, void __user *user, unsigned int len,
		int compat)
		int compat)
{
{
	unsigned int i;
	unsigned int i, curcpu;
	struct xt_counters_info tmp;
	struct xt_counters_info tmp;
	struct xt_counters *paddc;
	struct xt_counters *paddc;
	unsigned int num_counters;
	unsigned int num_counters;
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
		goto free;
		goto free;
	}
	}


	mutex_lock(&t->lock);

	local_bh_disable();
	private = t->private;
	private = t->private;
	if (private->number != num_counters) {
	if (private->number != num_counters) {
		ret = -EINVAL;
		ret = -EINVAL;
		goto unlock_up_free;
		goto unlock_up_free;
	}
	}


	preempt_disable();
	i = 0;
	i = 0;
	/* Choose the copy that is on our node */
	/* Choose the copy that is on our node */
	loc_cpu_entry = private->entries[raw_smp_processor_id()];
	curcpu = smp_processor_id();
	xt_info_wrlock(curcpu);
	loc_cpu_entry = private->entries[curcpu];
	IP6T_ENTRY_ITERATE(loc_cpu_entry,
	IP6T_ENTRY_ITERATE(loc_cpu_entry,
			  private->size,
			  private->size,
			  add_counter_to_entry,
			  add_counter_to_entry,
			  paddc,
			  paddc,
			  &i);
			  &i);
	preempt_enable();
	xt_info_wrunlock(curcpu);

 unlock_up_free:
 unlock_up_free:
	mutex_unlock(&t->lock);
	local_bh_enable();
	xt_table_unlock(t);
	xt_table_unlock(t);
	module_put(t->me);
	module_put(t->me);
 free:
 free:
+28 −25
Original line number Original line Diff line number Diff line
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
}
}
EXPORT_SYMBOL(xt_free_table_info);
EXPORT_SYMBOL(xt_free_table_info);


void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
			     struct xt_table_info *newinfo)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		void *p = oldinfo->entries[cpu];
		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
		newinfo->entries[cpu] = p;
	}

}
EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);

/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
				    const char *name)
				    const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
EXPORT_SYMBOL_GPL(xt_compat_unlock);
EXPORT_SYMBOL_GPL(xt_compat_unlock);
#endif
#endif


DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);


struct xt_table_info *
struct xt_table_info *
xt_replace_table(struct xt_table *table,
xt_replace_table(struct xt_table *table,
	      unsigned int num_counters,
	      unsigned int num_counters,
	      struct xt_table_info *newinfo,
	      struct xt_table_info *newinfo,
	      int *error)
	      int *error)
{
{
	struct xt_table_info *oldinfo, *private;
	struct xt_table_info *private;


	/* Do the substitution. */
	/* Do the substitution. */
	mutex_lock(&table->lock);
	local_bh_disable();
	private = table->private;
	private = table->private;

	/* Check inside lock: is the old number correct? */
	/* Check inside lock: is the old number correct? */
	if (num_counters != private->number) {
	if (num_counters != private->number) {
		duprintf("num_counters != table->private->number (%u/%u)\n",
		duprintf("num_counters != table->private->number (%u/%u)\n",
			 num_counters, private->number);
			 num_counters, private->number);
		mutex_unlock(&table->lock);
		local_bh_enable();
		*error = -EAGAIN;
		*error = -EAGAIN;
		return NULL;
		return NULL;
	}
	}
	oldinfo = private;
	rcu_assign_pointer(table->private, newinfo);
	newinfo->initial_entries = oldinfo->initial_entries;
	mutex_unlock(&table->lock);


	synchronize_net();
	table->private = newinfo;
	return oldinfo;
	newinfo->initial_entries = private->initial_entries;

	/*
	 * Even though table entries have now been swapped, other CPU's
	 * may still be using the old entries. This is okay, because
	 * resynchronization happens because of the locking done
	 * during the get_counters() routine.
	 */
	local_bh_enable();

	return private;
}
}
EXPORT_SYMBOL_GPL(xt_replace_table);
EXPORT_SYMBOL_GPL(xt_replace_table);


@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,


	/* Simplifies replace_table code. */
	/* Simplifies replace_table code. */
	table->private = bootstrap;
	table->private = bootstrap;
	mutex_init(&table->lock);


	if (!xt_replace_table(table, 0, newinfo, &ret))
	if (!xt_replace_table(table, 0, newinfo, &ret))
		goto unlock;
		goto unlock;
@@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = {


static int __init xt_init(void)
static int __init xt_init(void)
{
{
	int i, rv;
	unsigned int i;
	int rv;

	for_each_possible_cpu(i) {
		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
		spin_lock_init(&lock->lock);
		lock->readers = 0;
	}


	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
	if (!xt)
	if (!xt)