Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9dcb8b68 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

mm: remove per-zone hashtable of bitlock waitqueues



The per-zone waitqueues exist because of a scalability issue with the
page waitqueues on some NUMA machines, but it turns out that they hurt
normal loads, and now with the vmalloced stacks they also end up
breaking gfs2 that uses a bit_wait on a stack object:

     wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE)

where 'gh' can be a reference to the local variable 'mount_gh' on the
stack of fill_super().

The reason the per-zone hash table breaks for this case is that there is
no "zone" for virtual allocations, and trying to look up the physical
page to get at it will fail (with a BUG_ON()).

It turns out that I actually complained to the mm people about the
per-zone hash table for another reason just a month ago: the zone lookup
also hurts the regular use of "unlock_page()" a lot, because the zone
lookup ends up forcing several unnecessary cache misses and generates
horrible code.

As part of that earlier discussion, we had a much better solution for
the NUMA scalability issue - by just making the page lock have a
separate contention bit, the waitqueue doesn't even have to be looked at
for the normal case.

Peter Zijlstra already has a patch for that, but let's see if anybody
even notices.  In the meantime, let's fix the actual gfs2 breakage by
simplifying the bitlock waitqueues and removing the per-zone issue.

Reported-by: default avatarAndreas Gruenbacher <agruenba@redhat.com>
Tested-by: default avatarBob Peterson <rpeterso@redhat.com>
Acked-by: default avatarMel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 9fe68cad
Loading
Loading
Loading
Loading
+2 −28
Original line number Diff line number Diff line
@@ -440,33 +440,7 @@ struct zone {
	seqlock_t		span_seqlock;
#endif

	/*
	 * wait_table		-- the array holding the hash table
	 * wait_table_hash_nr_entries	-- the size of the hash table array
	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
	 *
	 * The purpose of all these is to keep track of the people
	 * waiting for a page to become available and make them
	 * runnable again when possible. The trouble is that this
	 * consumes a lot of space, especially when so few things
	 * wait on pages at a given time. So instead of using
	 * per-page waitqueues, we use a waitqueue hash table.
	 *
	 * The bucket discipline is to sleep on the same queue when
	 * colliding and wake all in that wait queue when removing.
	 * When something wakes, it must check to be sure its page is
	 * truly available, a la thundering herd. The cost of a
	 * collision is great, but given the expected load of the
	 * table, they should be so rare as to be outweighed by the
	 * benefits from the saved space.
	 *
	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
	 * primary users of these fields, and in mm/page_alloc.c
	 * free_area_init_core() performs the initialization of them.
	 */
	wait_queue_head_t	*wait_table;
	unsigned long		wait_table_hash_nr_entries;
	unsigned long		wait_table_bits;
	int initialized;

	/* Write-intensive fields used from the page allocator */
	ZONE_PADDING(_pad1_)
@@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)

static inline bool zone_is_initialized(struct zone *zone)
{
	return !!zone->wait_table;
	return zone->initialized;
}

static inline bool zone_is_empty(struct zone *zone)
+16 −0
Original line number Diff line number Diff line
@@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);

#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;

wait_queue_head_t *bit_waitqueue(void *word, int bit)
{
	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
	unsigned long val = (unsigned long)word << shift | bit;

	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
}
EXPORT_SYMBOL(bit_waitqueue);

void __init sched_init(void)
{
	int i, j;
	unsigned long alloc_size = 0, ptr;

	for (i = 0; i < WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(bit_wait_table + i);

#ifdef CONFIG_FAIR_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
+0 −10
Original line number Diff line number Diff line
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
}
EXPORT_SYMBOL(wake_up_bit);

wait_queue_head_t *bit_waitqueue(void *word, int bit)
{
	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
	const struct zone *zone = page_zone(virt_to_page(word));
	unsigned long val = (unsigned long)word << shift | bit;

	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
}
EXPORT_SYMBOL(bit_waitqueue);

/*
 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
 * index (we're keying off bit -1, but that would produce a horrible hash
+1 −3
Original line number Diff line number Diff line
@@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc);
 */
wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
	return bit_waitqueue(page, 0);
}
EXPORT_SYMBOL(page_waitqueue);

+0 −28
Original line number Diff line number Diff line
@@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);
@@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone_is_initialized(zone)) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

@@ -2158,20 +2144,6 @@ void try_offline_node(int nid)
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	/* free waittable in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * here only free if it's allocated by vmalloc.
		 */
		if (is_vmalloc_addr(zone->wait_table)) {
			vfree(zone->wait_table);
			zone->wait_table = NULL;
		}
	}
}
EXPORT_SYMBOL(try_offline_node);

Loading