Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4518e6a0 authored by Tejun Heo
Browse files

x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA



Embedding percpu first chunk allocator can now handle very sparse unit
mapping.  Use embedding allocator instead of lpage for 64bit NUMA.
This removes extra TLB pressure and the need to do complex and fragile
dancing when changing page attributes.

For 32bit, using very sparse unit mapping isn't a good idea because
the vmalloc space is very constrained.  32bit NUMA machines aren't
exactly the focus of optimization and it isn't very clear whether
lpage performs better than page.  Use page first chunk allocator for
32bit NUMAs.

As this leaves setup_pcpu_*() functions pretty much empty, fold them
into setup_per_cpu_areas().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
parent c8826dd5
Loading
Loading
Loading
Loading
+0 −4
Original line number Original line Diff line number Diff line
@@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
config NEED_PER_CPU_PAGE_FIRST_CHUNK
config NEED_PER_CPU_PAGE_FIRST_CHUNK
	def_bool y
	def_bool y


config NEED_PER_CPU_LPAGE_FIRST_CHUNK
	def_bool y
	depends on NEED_MULTIPLE_NODES

config HAVE_CPUMASK_OF_CPU_MAP
config HAVE_CPUMASK_OF_CPU_MAP
	def_bool X86_64_SMP
	def_bool X86_64_SMP


+28 −127
Original line number Original line Diff line number Diff line
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
#define PERCPU_FIRST_CHUNK_RESERVE	0
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif
#endif


#ifdef CONFIG_X86_32
/**
/**
 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
 *
 *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
#endif
#endif
	return false;
	return false;
}
}
#endif


/**
/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
	free_bootmem(__pa(ptr), size);
	free_bootmem(__pa(ptr), size);
}
}


/*
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
 * Large page remapping allocator
 */
#ifdef CONFIG_NEED_MULTIPLE_NODES
static void __init pcpul_map(void *ptr, size_t size, void *addr)
{
	pmd_t *pmd, pmd_v;

	pmd = populate_extra_pmd((unsigned long)addr);
	pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
	set_pmd(pmd, pmd_v);
}

static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
{
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
		return LOCAL_DISTANCE;
	else
	else
		return REMOTE_DISTANCE;
		return REMOTE_DISTANCE;
}

static int __init setup_pcpu_lpage(bool chosen)
{
	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
	struct pcpu_alloc_info *ai;
	int rc;

	/* on non-NUMA, embedding is better */
	if (!chosen && !pcpu_need_numa())
		return -EINVAL;

	/* need PSE */
	if (!cpu_has_pse) {
		pr_warning("PERCPU: lpage allocator requires PSE\n");
		return -EINVAL;
	}

	/* allocate and build unit_map */
	ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
				   PMD_SIZE, pcpu_lpage_cpu_distance);
	if (IS_ERR(ai)) {
		pr_warning("PERCPU: failed to build unit_map (%ld)\n",
			   PTR_ERR(ai));
		return PTR_ERR(ai);
	}

	/* do the parameters look okay? */
	if (!chosen) {
		size_t vm_size = VMALLOC_END - VMALLOC_START;
		size_t tot_size = 0;
		int group;

		for (group = 0; group < ai->nr_groups; group++)
			tot_size += ai->unit_size * ai->groups[group].nr_units;

		/* don't consume more than 20% of vmalloc area */
		if (tot_size > vm_size / 5) {
			pr_info("PERCPU: too large chunk size %zuMB for "
				"large page remap\n", tot_size >> 20);
			rc = -EINVAL;
			goto out_free;
		}
	}

	rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
out_free:
	pcpu_free_alloc_info(ai);
	return rc;
}
#else
#else
static int __init setup_pcpu_lpage(bool chosen)
	return LOCAL_DISTANCE;
{
	return -EINVAL;
}
#endif
#endif

/*
 * Embedding allocator
 *
 * The first chunk is sized to just contain the static area plus
 * module and dynamic reserves and embedded into linear physical
 * mapping so that it can use PMD mapping without additional TLB
 * pressure.
 */
static int __init setup_pcpu_embed(bool chosen)
{
	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;

	/*
	 * If large page isn't supported, there's no benefit in doing
	 * this.  Also, embedding allocation doesn't play well with
	 * NUMA.
	 */
	if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
		return -EINVAL;

	return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
				      PAGE_SIZE, NULL, pcpu_fc_alloc,
				      pcpu_fc_free);
}
}


/*
 * Page allocator
 *
 * Boring fallback 4k page allocator.  This allocator puts more
 * pressure on PTE TLBs but other than that behaves nicely on both UMA
 * and NUMA.
 */
static void __init pcpup_populate_pte(unsigned long addr)
static void __init pcpup_populate_pte(unsigned long addr)
{
{
	populate_extra_pte(addr);
	populate_extra_pte(addr);
}
}


static int __init setup_pcpu_page(void)
{
	return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
				     pcpu_fc_alloc, pcpu_fc_free,
				     pcpup_populate_pte);
}

static inline void setup_percpu_segment(int cpu)
static inline void setup_percpu_segment(int cpu)
{
{
#ifdef CONFIG_X86_32
#ifdef CONFIG_X86_32
@@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void)
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);


	/*
	/*
	 * Allocate percpu area.  If PSE is supported, try to make use
	 * Allocate percpu area.  Embedding allocator is our favorite;
	 * of large page mappings.  Please read comments on top of
	 * however, on NUMA configurations, it can result in very
	 * each allocator for details.
	 * sparse unit mapping and vmalloc area isn't spacious enough
	 * on 32bit.  Use page in that case.
	 */
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	rc = -EINVAL;
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_AUTO) {
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
			if (pcpu_chosen_fc == PCPU_FC_LPAGE)
		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
				rc = setup_pcpu_lpage(true);
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			else
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
				rc = setup_pcpu_embed(true);


		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
		if (rc < 0)
			pr_warning("PERCPU: %s allocator failed (%d), "
			pr_warning("PERCPU: %s allocator failed (%d), "
				   "falling back to page size\n",
				   "falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	}
	} else {
		rc = setup_pcpu_lpage(false);
	if (rc < 0)
	if (rc < 0)
			rc = setup_pcpu_embed(false);
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
	}
					   pcpu_fc_alloc, pcpu_fc_free,
	if (rc < 0)
					   pcpup_populate_pte);
		rc = setup_pcpu_page();
	if (rc < 0)
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);
		panic("cannot initialize percpu area (err=%d)", rc);