Commit bf72aeba authored by Paul Mackerras

powerpc: Use 64k pages without needing cache-inhibited large pages



Some POWER5+ machines can do 64k hardware pages for normal memory but
not for cache-inhibited pages.  This patch lets us use 64k hardware
pages for most user processes on such machines (assuming the kernel
has been configured with CONFIG_PPC_64K_PAGES=y).  User processes
start out using 64k pages and get switched to 4k pages if they use any
non-cacheable mappings.
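
A minimal standalone C model of that demotion rule (illustrative names
only, not the kernel's structures; the real logic is in the hash_page()
hunk below):

	#include <stdio.h>
	#include <stdbool.h>

	enum psize { PSIZE_4K, PSIZE_64K };
	struct context { enum psize user_psize; };

	/* Called on a hash fault; demote the whole context the first time
	 * a non-cacheable PTE shows up while we are still using 64k pages.
	 * (The real code also updates context.sllp and rebolts the SLB.) */
	static void hash_fault(struct context *ctx, bool pte_no_cache)
	{
		if (ctx->user_psize == PSIZE_64K && pte_no_cache)
			ctx->user_psize = PSIZE_4K;
	}

	int main(void)
	{
		struct context ctx = { PSIZE_64K };  /* new processes start at 64k */
		hash_fault(&ctx, false);             /* cacheable mapping: stays 64k */
		hash_fault(&ctx, true);              /* non-cacheable: demoted to 4k */
		printf("user psize: %s\n", ctx.user_psize == PSIZE_64K ? "64k" : "4k");
		return 0;
	}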

With this, we use 64k pages for the vmalloc region and 4k pages for
the imalloc region.  If anything creates a non-cacheable mapping in
the vmalloc region, the vmalloc region will get switched to 4k pages.
I don't know of any driver other than the DRM that would do this,
though, and these machines don't have AGP.
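
A standalone sketch of the per-region page-size choice, assuming
simplified globals in place of the mmu_vmalloc_psize and mmu_io_psize
variables added below (names in this sketch are illustrative):

	#include <stdio.h>

	enum psize { PSIZE_4K, PSIZE_64K };

	static enum psize vmalloc_psize = PSIZE_64K;  /* drops to 4k on demotion */
	static enum psize io_psize      = PSIZE_4K;   /* imalloc/ioremap region  */

	/* Kernel-region faults below VMALLOC_END use the vmalloc size;
	 * anything above it (the imalloc region) uses the I/O size. */
	static enum psize kernel_region_psize(int below_vmalloc_end)
	{
		return below_vmalloc_end ? vmalloc_psize : io_psize;
	}

	int main(void)
	{
		printf("vmalloc: %s\n", kernel_region_psize(1) == PSIZE_64K ? "64k" : "4k");
		printf("imalloc: %s\n", kernel_region_psize(0) == PSIZE_64K ? "64k" : "4k");
		return 0;
	}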

When a region gets switched from 64k pages to 4k pages, we do not have
to clear out all the 64k HPTEs from the hash table immediately.  We
use the _PAGE_COMBO bit in the Linux PTE to indicate whether the page
was hashed in as a 64k page or a set of 4k pages.  If hash_page is
trying to insert a 4k page for a Linux PTE and it sees that it has
already been inserted as a 64k page, it first invalidates the 64k HPTE
before inserting the 4k HPTE.  The hash invalidation routines also use
the _PAGE_COMBO bit, to determine whether to look for a 64k HPTE or a
set of 4k HPTEs to remove.  With those two changes, we can tolerate a
mix of 4k and 64k HPTEs in the hash table, and they will all get
removed when the address space is torn down.
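
A standalone C model of that lazy invalidation (the flag values and
helper names here are made up for illustration): _PAGE_COMBO in the
Linux PTE records that the page is currently hashed as 4k sub-pages;
if it is absent while _PAGE_HASHPTE is set, the existing hash entry
must be a 64k one and is flushed before the 4k entry goes in.

	#include <stdio.h>

	#define PAGE_HASHPTE 0x1UL	/* PTE has an entry in the hash table  */
	#define PAGE_COMBO   0x2UL	/* that entry is a set of 4k HPTEs     */

	static void flush_64k_hpte(unsigned long *pte)
	{
		printf("flushing stale 64k HPTE\n");
		*pte &= ~PAGE_HASHPTE;
	}

	static void hash_insert_4k(unsigned long *pte)
	{
		/* PTE was hashed as a single 64k page: invalidate it first */
		if ((*pte & PAGE_HASHPTE) && !(*pte & PAGE_COMBO))
			flush_64k_hpte(pte);

		*pte |= PAGE_HASHPTE | PAGE_COMBO;	/* now hashed as 4k sub-pages */
		printf("inserted 4k HPTE\n");
	}

	int main(void)
	{
		unsigned long pte = PAGE_HASHPTE;	/* previously hashed as 64k */
		hash_insert_4k(&pte);
		return 0;
	}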

Signed-off-by: Paul Mackerras <paulus@samba.org>
parent 31925323
+2 −0
@@ -122,6 +122,8 @@ int main(void)
	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
	DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp));
	DEFINE(PACAVMALLOCSLLP, offsetof(struct paca_struct, vmalloc_sllp));
#ifdef CONFIG_HUGETLB_PAGE
	DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
	DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
+3 −0
@@ -948,7 +948,10 @@ static struct ibm_pa_feature {
	{CPU_FTR_CTRL, 0,		0, 3, 0},
	{CPU_FTR_NOEXECUTE, 0,		0, 6, 0},
	{CPU_FTR_NODSISRALIGN, 0,	1, 1, 1},
#if 0
	/* put this back once we know how to test if firmware does 64k IO */
	{CPU_FTR_CI_LARGE_PAGE, 0,	1, 2, 0},
#endif
};

static void __init check_cpu_pa_features(unsigned long node)
+28 −0
@@ -369,6 +369,7 @@ _GLOBAL(__hash_page_4K)
	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
	or	r30,r30,r31
	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
	oris	r30,r30,_PAGE_COMBO@h
	/* Write the linux PTE atomically (setting busy) */
	stdcx.	r30,0,r6
	bne-	1b
@@ -428,6 +429,14 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
	andi.	r0,r31,_PAGE_HASHPTE
	li	r26,0			/* Default hidx */
	beq	htab_insert_pte

	/*
	 * Check if the pte was already inserted into the hash table
	 * as a 64k HW page, and invalidate the 64k HPTE if so.
	 */
	andis.	r0,r31,_PAGE_COMBO@h
	beq	htab_inval_old_hpte

	ld	r6,STK_PARM(r6)(r1)
	ori	r26,r6,0x8000		/* Load the hidx mask */
	ld	r26,0(r26)
@@ -498,6 +507,19 @@ _GLOBAL(htab_call_hpte_remove)
	/* Try all again */
	b	htab_insert_pte

	/*
	 * Call out to C code to invalidate an 64k HW HPTE that is
	 * useless now that the segment has been switched to 4k pages.
	 */
htab_inval_old_hpte:
	mr	r3,r29			/* virtual addr */
	mr	r4,r31			/* PTE.pte */
	li	r5,0			/* PTE.hidx */
	li	r6,MMU_PAGE_64K		/* psize */
	ld	r7,STK_PARM(r8)(r1)	/* local */
	bl	.flush_hash_page
	b	htab_insert_pte
	
htab_bail_ok:
	li	r3,0
	b	htab_bail
@@ -638,6 +660,12 @@ _GLOBAL(__hash_page_64K)
	 * is changing this PTE anyway and might hash it.
	 */
	bne-	ht64_bail_ok
BEGIN_FTR_SECTION
	/* Check if PTE has the cache-inhibit bit set */
	andi.	r0,r31,_PAGE_NO_CACHE
	/* If so, bail out and refault as a 4k page */
	bne-	ht64_bail_ok
END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
	/* Prepare new PTE value (turn access RW into DIRTY, then
	 * add BUSY,HASHPTE and ACCESSED)
	 */
+75 −9
@@ -92,10 +92,15 @@ unsigned long htab_size_bytes;
unsigned long htab_hash_mask;
int mmu_linear_psize = MMU_PAGE_4K;
int mmu_virtual_psize = MMU_PAGE_4K;
int mmu_vmalloc_psize = MMU_PAGE_4K;
int mmu_io_psize = MMU_PAGE_4K;
#ifdef CONFIG_HUGETLB_PAGE
int mmu_huge_psize = MMU_PAGE_16M;
unsigned int HPAGE_SHIFT;
#endif
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif

/* There are definitions of page sizes arrays to be used when none
 * is provided by the firmware.
@@ -308,20 +313,31 @@ static void __init htab_init_page_sizes(void)
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		mmu_linear_psize = MMU_PAGE_1M;

#ifdef CONFIG_PPC_64K_PAGES
	/*
	 * Pick a size for the ordinary pages. Default is 4K, we support
	 * 64K if cache inhibited large pages are supported by the
	 * processor
	 * 64K for user mappings and vmalloc if supported by the processor.
	 * We only use 64k for ioremap if the processor
	 * (and firmware) support cache-inhibited large pages.
	 * If not, we use 4k and set mmu_ci_restrictions so that
	 * hash_page knows to switch processes that use cache-inhibited
	 * mappings to 4k pages.
	 */
#ifdef CONFIG_PPC_64K_PAGES
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    cpu_has_feature(CPU_FTR_CI_LARGE_PAGE))
	if (mmu_psize_defs[MMU_PAGE_64K].shift) {
		mmu_virtual_psize = MMU_PAGE_64K;
		mmu_vmalloc_psize = MMU_PAGE_64K;
		if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE))
			mmu_io_psize = MMU_PAGE_64K;
		else
			mmu_ci_restrictions = 1;
	}
#endif

	printk(KERN_DEBUG "Page orders: linear mapping = %d, others = %d\n",
	printk(KERN_DEBUG "Page orders: linear mapping = %d, "
	       "virtual = %d, io = %d\n",
	       mmu_psize_defs[mmu_linear_psize].shift,
	       mmu_psize_defs[mmu_virtual_psize].shift);
	       mmu_psize_defs[mmu_virtual_psize].shift,
	       mmu_psize_defs[mmu_io_psize].shift);

#ifdef CONFIG_HUGETLB_PAGE
	/* Init large page size. Currently, we pick 16M or 1M depending
@@ -556,6 +572,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
	pte_t *ptep;
	cpumask_t tmp;
	int rc, user_region = 0, local = 0;
	int psize;

	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
		ea, access, trap);
@@ -575,10 +592,15 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
			return 1;
		}
		vsid = get_vsid(mm->context.id, ea);
		psize = mm->context.user_psize;
		break;
	case VMALLOC_REGION_ID:
		mm = &init_mm;
		vsid = get_kernel_vsid(ea);
		if (ea < VMALLOC_END)
			psize = mmu_vmalloc_psize;
		else
			psize = mmu_io_psize;
		break;
	default:
		/* Not a valid range
@@ -629,7 +651,40 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
#ifndef CONFIG_PPC_64K_PAGES
	rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
#else
	if (mmu_virtual_psize == MMU_PAGE_64K)
	if (mmu_ci_restrictions) {
		/* If this PTE is non-cacheable, switch to 4k */
		if (psize == MMU_PAGE_64K &&
		    (pte_val(*ptep) & _PAGE_NO_CACHE)) {
			if (user_region) {
				psize = MMU_PAGE_4K;
				mm->context.user_psize = MMU_PAGE_4K;
				mm->context.sllp = SLB_VSID_USER |
					mmu_psize_defs[MMU_PAGE_4K].sllp;
			} else if (ea < VMALLOC_END) {
				/*
				 * some driver did a non-cacheable mapping
				 * in vmalloc space, so switch vmalloc
				 * to 4k pages
				 */
				printk(KERN_ALERT "Reducing vmalloc segment "
				       "to 4kB pages because of "
				       "non-cacheable mapping\n");
				psize = mmu_vmalloc_psize = MMU_PAGE_4K;
			}
		}
		if (user_region) {
			if (psize != get_paca()->context.user_psize) {
				get_paca()->context = mm->context;
				slb_flush_and_rebolt();
			}
		} else if (get_paca()->vmalloc_sllp !=
			   mmu_psize_defs[mmu_vmalloc_psize].sllp) {
			get_paca()->vmalloc_sllp =
				mmu_psize_defs[mmu_vmalloc_psize].sllp;
			slb_flush_and_rebolt();
		}
	}
	if (psize == MMU_PAGE_64K)
		rc = __hash_page_64K(ea, access, vsid, ptep, trap, local);
	else
		rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
@@ -681,7 +736,18 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
#ifndef CONFIG_PPC_64K_PAGES
	__hash_page_4K(ea, access, vsid, ptep, trap, local);
#else
	if (mmu_virtual_psize == MMU_PAGE_64K)
	if (mmu_ci_restrictions) {
		/* If this PTE is non-cacheable, switch to 4k */
		if (mm->context.user_psize == MMU_PAGE_64K &&
		    (pte_val(*ptep) & _PAGE_NO_CACHE)) {
			mm->context.user_psize = MMU_PAGE_4K;
			mm->context.sllp = SLB_VSID_USER |
				mmu_psize_defs[MMU_PAGE_4K].sllp;
			get_paca()->context = mm->context;
			slb_flush_and_rebolt();
		}
	}
	if (mm->context.user_psize == MMU_PAGE_64K)
		__hash_page_64K(ea, access, vsid, ptep, trap, local);
	else
		__hash_page_4K(ea, access, vsid, ptep, trap, local);
+3 −0
@@ -49,6 +49,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
	}

	mm->context.id = index;
	mm->context.user_psize = mmu_virtual_psize;
	mm->context.sllp = SLB_VSID_USER |
		mmu_psize_defs[mmu_virtual_psize].sllp;

	return 0;
}