Merge "arm64: Hot-remove implementation for arm64" (5cac878f) · Commits · e / devices / android_kernel_sony_msm8998

arch/arm64/Kconfig

+1 −0

Original line number	Diff line number	Diff line
		@@ -653,6 +653,7 @@ config HOTPLUG_CPU
		can be controlled through /sys/devices/system/cpu.

		config ARCH_ENABLE_MEMORY_HOTPLUG
		depends on !NUMA
		def_bool y

		config ARCH_ENABLE_MEMORY_HOTREMOVE

arch/arm64/include/asm/mmu.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -35,5 +35,12 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
		unsigned long virt, phys_addr_t size,
		pgprot_t prot);
		extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
		#ifdef CONFIG_MEMORY_HOTPLUG
		/* Build kernel page tables covering a hot-added physical range. */
		extern void hotplug_paging(phys_addr_t start, phys_addr_t size);
		#ifdef CONFIG_MEMORY_HOTREMOVE
		/*
		 * Tear down the kernel page tables for the virtual range [start, end).
		 * Callers pass direct == true for the linear mapping and
		 * direct == false for the vmemmap.
		 */
		extern void remove_pagetable(unsigned long start,
		unsigned long end, bool direct);
		#endif /* CONFIG_MEMORY_HOTREMOVE */
		#endif /* CONFIG_MEMORY_HOTPLUG */

		#endif

arch/arm64/include/asm/pgtable.h

+15 −0

Original line number	Diff line number	Diff line
		@@ -461,6 +461,11 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
		return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK;
		}

		/* Kernel virtual address of the page whose physical address @pmd holds. */
		static inline unsigned long pmd_page_vaddr(pmd_t pmd)
		{
			phys_addr_t table_phys = pmd_page_paddr(pmd);

			return (unsigned long)__va(table_phys);
		}

		/* Find an entry in the third-level page table. */
		#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))

		@@ -512,6 +517,11 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
		return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK;
		}

		/* Kernel virtual address of the page whose physical address @pud holds. */
		static inline unsigned long pud_page_vaddr(pud_t pud)
		{
			phys_addr_t table_phys = pud_page_paddr(pud);

			return (unsigned long)__va(table_phys);
		}

		/* Find an entry in the second-level page table. */
		#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))

		@@ -564,6 +574,11 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
		return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK;
		}

		/* Kernel virtual address of the page whose physical address @pgd holds. */
		static inline unsigned long pgd_page_vaddr(pgd_t pgd)
		{
			phys_addr_t table_phys = pgd_page_paddr(pgd);

			return (unsigned long)__va(table_phys);
		}

		/* Find an entry in the first-level page table. */
		#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))

arch/arm64/mm/init.c

+78 −7

Original line number	Diff line number	Diff line
		@@ -503,14 +503,74 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
		struct zone *zone;
		unsigned long start_pfn = start >> PAGE_SHIFT;
		unsigned long nr_pages = size >> PAGE_SHIFT;
		unsigned long end_pfn = start_pfn + nr_pages;
		unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
		int ret;

		if (end_pfn > max_sparsemem_pfn) {
		pr_err("end_pfn too big");
		return -1;
		}
		hotplug_paging(start, size);

		/*
		* Mark the first page in the range as unusable. This is needed
		* because __add_section (within __add_pages) wants pfn_valid
		* of it to be false, and in arm64 pfn_valid is implemented by
		* just checking the nomap flag for existing blocks.
		*
		* A small trick here is that __add_section() requires only
		* phys_start_pfn (that is the first pfn of a section) to be
		* invalid. Regardless of whether it was assumed (by the function
		* author) that all pfns within a section are either all valid
		* or all invalid, it allows to avoid looping twice (once here,
		* second when memblock_clear_nomap() is called) through all
		* pfns of the section and modify only one pfn. Thanks to that,
		* further, in __add_zone() only this very first pfn is skipped
		* and corresponding page is not flagged reserved. Therefore it
		* is enough to correct this setup only for it.
		*
		* When arch_add_memory() returns the walk_memory_range() function
		* is called and passed with online_memory_block() callback,
		* which execution finally reaches the memory_block_action()
		* function, where also only the first pfn of a memory block is
		* checked to be reserved. Above, it was first pfn of a section,
		* here it is a block but
		* (drivers/base/memory.c):
		* sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
		* (include/linux/memory.h):
		* #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
		* so we can consider block and section equivalently
		*/
		memblock_mark_nomap(start, 1<<PAGE_SHIFT);

		pgdat = NODE_DATA(nid);

		zone = pgdat->node_zones +
		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
		ret = __add_pages(nid, zone, start_pfn, nr_pages);

		/*
		* Make the pages usable after they have been added.
		* This will make pfn_valid return true
		*/
		memblock_clear_nomap(start, 1<<PAGE_SHIFT);

		/*
		* This is a hack to avoid having to mix arch specific code
		* into arch independent code. SetPageReserved is supposed
		* to be called by __add_zone (within __add_section, within
		* __add_pages). However, when it is called there, it assumes that
		* pfn_valid returns true. For the way pfn_valid is implemented
		* in arm64 (a check on the nomap flag), the only way to make
		* this evaluate true inside __add_zone is to clear the nomap
		* flags of blocks in architecture independent code.
		*
		* To avoid this, we set the Reserved flag here after we cleared
		* the nomap flag in the line above.
		*/
		SetPageReserved(pfn_to_page(start_pfn));

		if (ret)
		pr_warn("%s: Problem encountered in __add_pages() ret=%d\n",
		__func__, ret);
		@@ -519,21 +579,32 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
		}

		#ifdef CONFIG_MEMORY_HOTREMOVE
		/*
		 * Drop the linear-map page tables for the physical range [start, end).
		 * The bounds are translated to kernel virtual addresses first because
		 * remove_pagetable() walks by virtual address.
		 */
		static void kernel_physical_mapping_remove(unsigned long start,
							   unsigned long end)
		{
			unsigned long virt_start = (unsigned long)__va(start);
			unsigned long virt_end = (unsigned long)__va(end);

			remove_pagetable(virt_start, virt_end, true);
		}

		/*
		 * Hot-remove the physical range [start, start + size).
		 *
		 * The pages are first removed from their zone, then the linear mapping
		 * of the range is torn down. The mapping is removed even when
		 * __remove_pages() reports an error; that error is logged, warned about
		 * once, and returned to the caller.
		 *
		 * Returns the result of __remove_pages() (0 on success).
		 */
		int arch_remove_memory(u64 start, u64 size)
		{
			unsigned long start_pfn = start >> PAGE_SHIFT;
			unsigned long nr_pages = size >> PAGE_SHIFT;
			struct page *page = pfn_to_page(start_pfn);
			struct zone *zone = page_zone(page);
			int ret;

			ret = __remove_pages(zone, start_pfn, nr_pages);
			if (ret)
				pr_warn("%s: Problem encountered in __remove_pages() ret=%d\n",
					__func__, ret);
			WARN_ON_ONCE(ret);

			kernel_physical_mapping_remove(start, start + size);

			return ret;
		}
		#endif
		#endif

		#endif /* CONFIG_MEMORY_HOTREMOVE */
		#endif /* CONFIG_MEMORY_HOTPLUG */

arch/arm64/mm/mmu.c

+420 −0

Original line number	Diff line number	Diff line
		@@ -605,6 +605,423 @@ void __init paging_init(void)
		bootmem_init();
		}

		#ifdef CONFIG_MEMORY_HOTPLUG
		/*
		 * Allocate a single page for use as a page-table page and return its
		 * physical address. A failed allocation or a failed page-table
		 * constructor is treated as fatal (BUG()).
		 */
		static phys_addr_t pgd_pgtable_alloc(void)
		{
			void *ptr = (void *)__get_free_page(PGALLOC_GFP);

			if (!ptr || !pgtable_page_ctor(virt_to_page(ptr)))
				BUG();

			/* Ensure the zeroed page is visible to the page table walker */
			dsb(ishst);
			return __pa(ptr);
		}

		/*
		 * hotplug_paging() is used by memory hotplug to build new page tables
		 * for hot added memory.
		 *
		 * The mapping for [start, start + size) is created in a scratch copy of
		 * swapper_pg_dir; the result is then copied back into swapper_pg_dir
		 * and the scratch page is released.
		 */
		void hotplug_paging(phys_addr_t start, phys_addr_t size)
		{

		struct page *pg;
		/* Scratch top-level table, accessed through the pgd fixmap slot */
		phys_addr_t pgd_phys = pgd_pgtable_alloc();
		pgd_t *pgd = pgd_set_fixmap(pgd_phys);

		/* Start from the current contents of the kernel page table */
		memcpy(pgd, swapper_pg_dir, PAGE_SIZE);

		/* Map the new range at its __phys_to_virt() address */
		__create_pgd_mapping(pgd, start, __phys_to_virt(start), size,
		PAGE_KERNEL, pgd_pgtable_alloc);

		/*
		 * NOTE(review): presumably cpu_replace_ttbr1() switches the live
		 * kernel table, so swapper_pg_dir is only overwritten while the
		 * scratch table is in use -- confirm against its definition.
		 */
		cpu_replace_ttbr1(__va(pgd_phys));
		memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
		cpu_replace_ttbr1(swapper_pg_dir);

		pgd_clear_fixmap();

		/* Undo pgd_pgtable_alloc(): run the destructor and free the page */
		pg = phys_to_page(pgd_phys);
		pgtable_page_dtor(pg);
		__free_pages(pg, 0);
		}

		#ifdef CONFIG_MEMORY_HOTREMOVE
		#define PAGE_INUSE 0xFD

		/*
		 * Release the 2^order pages starting at @page.
		 *
		 * Pages carrying the Reserved flag are handed back through the bootmem
		 * helpers: put_page_bootmem() when the magic stored in page->lru.next is
		 * SECTION_INFO or MIX_SECTION_INFO, free_reserved_page() otherwise.
		 * All other pages go back through free_pages(); when @direct is set the
		 * page-table destructor is run first.
		 */
		static void free_pagetable(struct page *page, int order, bool direct)
		{
			unsigned long magic;
			unsigned int nr_pages = 1 << order;

			/* bootmem page has reserved flag */
			if (PageReserved(page)) {
				__ClearPageReserved(page);

				magic = (unsigned long)page->lru.next;
				if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
					while (nr_pages--)
						put_page_bootmem(page++);
				} else {
					while (nr_pages--)
						free_reserved_page(page++);
				}
			} else {
				/*
				 * Only direct pagetable allocation (those allocated via
				 * hotplug) call the pgtable_page_ctor; vmemmap pgtable
				 * allocations don't.
				 */
				if (direct)
					pgtable_page_dtor(page);

				free_pages((unsigned long)page_address(page), order);
			}
		}

		/*
		 * Free the PTE table referenced by @pmd if every entry in it is empty,
		 * and clear the PMD entry. Does nothing while any PTE is still in use.
		 */
		static void free_pte_table(pmd_t *pmd, bool direct)
		{
			pte_t *pte_start, *pte;
			struct page *page;
			int i;

			pte_start = (pte_t *)pmd_page_vaddr(*pmd);
			/* Bail out as soon as one PTE in the table is still populated */
			for (i = 0; i < PTRS_PER_PTE; i++) {
				pte = pte_start + i;
				if (!pte_none(*pte))
					return;
			}

			/* Look up the table page while the PMD entry still references it */
			page = pmd_page(*pmd);

			/*
			 * The original author notes that, on arm64, page_table_lock seems
			 * to be taken only in __pte_alloc_kernel() (mm/memory.c) and that
			 * it is unclear whether this path can race with it; the lock is
			 * kept to be safe.
			 */
			spin_lock(&init_mm.page_table_lock);
			pmd_clear(pmd);
			spin_unlock(&init_mm.page_table_lock);

			/*
			 * Free the table only after it has been unhooked, so the page is
			 * never handed back to the allocator while a live entry points
			 * at it.
			 */
			free_pagetable(page, 0, direct);
		}

		/*
		 * Free the PMD table referenced by @pud if every entry in it is empty,
		 * and clear the PUD entry. Does nothing while any PMD is still in use.
		 */
		static void free_pmd_table(pud_t *pud, bool direct)
		{
			pmd_t *pmd_start, *pmd;
			struct page *page;
			int i;

			pmd_start = (pmd_t *)pud_page_vaddr(*pud);
			/* Bail out as soon as one PMD in the table is still populated */
			for (i = 0; i < PTRS_PER_PMD; i++) {
				pmd = pmd_start + i;
				if (!pmd_none(*pmd))
					return;
			}

			/* Look up the table page while the PUD entry still references it */
			page = pud_page(*pud);

			/*
			 * The original author notes that, on arm64, page_table_lock seems
			 * to be taken only in __pte_alloc_kernel() (mm/memory.c) and that
			 * it is unclear whether this path can race with it; the lock is
			 * kept to be safe.
			 */
			spin_lock(&init_mm.page_table_lock);
			pud_clear(pud);
			spin_unlock(&init_mm.page_table_lock);

			/*
			 * Free the table only after it has been unhooked, so the page is
			 * never handed back to the allocator while a live entry points
			 * at it.
			 */
			free_pagetable(page, 0, direct);
		}

		/*
		 * When the PUD is folded on the PGD (three levels of paging),
		 * there's no need to free PUDs
		 */
		#if CONFIG_PGTABLE_LEVELS > 3
		/*
		 * Free the PUD table referenced by @pgd if every entry in it is empty,
		 * and clear the PGD entry. Does nothing while any PUD is still in use.
		 */
		static void free_pud_table(pgd_t *pgd, bool direct)
		{
			pud_t *pud_start, *pud;
			struct page *page;
			int i;

			pud_start = (pud_t *)pgd_page_vaddr(*pgd);
			/* Bail out as soon as one PUD in the table is still populated */
			for (i = 0; i < PTRS_PER_PUD; i++) {
				pud = pud_start + i;
				if (!pud_none(*pud))
					return;
			}

			/* Look up the table page while the PGD entry still references it */
			page = pgd_page(*pgd);

			/*
			 * The original author notes that, on arm64, page_table_lock seems
			 * to be taken only in __pte_alloc_kernel() (mm/memory.c) and that
			 * it is unclear whether this path can race with it; the lock is
			 * kept to be safe.
			 */
			spin_lock(&init_mm.page_table_lock);
			pgd_clear(pgd);
			spin_unlock(&init_mm.page_table_lock);

			/*
			 * Free the table only after it has been unhooked, so the page is
			 * never handed back to the allocator while a live entry points
			 * at it.
			 */
			free_pagetable(page, 0, direct);
		}
		#endif

		/*
		 * Walk the PTEs covering [addr, end), starting at @pte, and unmap them.
		 *
		 * Page-aligned steps clear the PTE; the mapped page itself is freed only
		 * when @direct is false. Steps that cover part of a page are filled with
		 * PAGE_INUSE, and the page is freed and unmapped once it consists
		 * entirely of PAGE_INUSE bytes. The TLB is flushed before returning.
		 */
		static void remove_pte_table(pte_t *pte, unsigned long addr,
		unsigned long end, bool direct)
		{
		unsigned long next;
		void *page_addr;

		for (; addr < end; addr = next, pte++) {
		/* Advance to the next page boundary, clamped to @end */
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
		next = end;

		if (!pte_present(*pte))
		continue;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
		/*
		 * Do not free direct mapping pages since they were
		 * freed when offlining, or simply not in use.
		 */
		if (!direct)
		free_pagetable(pte_page(*pte), 0, direct);

		/*
		 * The original author notes that, on arm64, this lock
		 * seems to be taken only in __pte_alloc_kernel()
		 * (mm/memory.c) and that it is unclear whether this
		 * path can race with it. It is kept to be safe, but
		 * can probably be removed.
		 */
		spin_lock(&init_mm.page_table_lock);
		pte_clear(&init_mm, addr, pte);
		spin_unlock(&init_mm.page_table_lock);
		} else {
		/*
		 * If we are here, we are freeing vmemmap pages since
		 * direct mapped memory ranges to be freed are aligned.
		 *
		 * If we are not removing the whole page, it means
		 * other page structs in this page are being used and
		 * we cannot remove them. So fill the unused page structs
		 * with 0xFD, and remove the page when it is wholly
		 * filled with 0xFD.
		 */
		memset((void *)addr, PAGE_INUSE, next - addr);

		page_addr = page_address(pte_page(*pte));
		if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
		free_pagetable(pte_page(*pte), 0, direct);

		/* Same locking caveat as in the aligned branch above. */
		spin_lock(&init_mm.page_table_lock);
		pte_clear(&init_mm, addr, pte);
		spin_unlock(&init_mm.page_table_lock);
		}
		}
		}

		// This flush was added for symmetry with the x86 code. Open question
		// from the author: why is it needed here and not in remove_p[mu]d?
		flush_tlb_all();
		}

		/*
		 * Walk the PMD entries covering [addr, end), starting at @pmd.
		 *
		 * Section mappings are handled here: they are cleared directly, or, for
		 * partial ranges, filled with PAGE_INUSE and released once the whole
		 * section consists of PAGE_INUSE bytes. Table entries are descended into
		 * with remove_pte_table(), after which the PTE table is freed if it has
		 * become empty.
		 */
		static void remove_pmd_table(pmd_t *pmd, unsigned long addr,
		unsigned long end, bool direct)
		{
		unsigned long next;
		void *page_addr;
		pte_t *pte;

		for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
		continue;

		// check if this entry is a section mapping (2MB per the original note)
		if (pmd_sect(*pmd)) {
		/*
		 * NOTE(review): the whole section is dropped when the range is
		 * merely page-aligned; confirm callers never pass a range that
		 * covers only part of a section.
		 */
		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
		/* Backing pages are freed only for non-direct mappings */
		if (!direct) {
		free_pagetable(pmd_page(*pmd),
		get_order(PMD_SIZE), direct);
		}
		/*
		 * The original author notes that, on arm64, this lock
		 * seems to be taken only in __pte_alloc_kernel()
		 * (mm/memory.c) and that it is unclear whether this
		 * path can race with it. It is kept to be safe, but
		 * can probably be removed.
		 */
		spin_lock(&init_mm.page_table_lock);
		pmd_clear(pmd);
		spin_unlock(&init_mm.page_table_lock);
		} else {
		/* If here, we are freeing vmemmap pages. */
		memset((void *)addr, PAGE_INUSE, next - addr);

		/* Release the section once all of it is marked PAGE_INUSE */
		page_addr = page_address(pmd_page(*pmd));
		if (!memchr_inv(page_addr, PAGE_INUSE,
		PMD_SIZE)) {
		free_pagetable(pmd_page(*pmd),
		get_order(PMD_SIZE), direct);

		/* Same locking caveat as in the aligned branch above. */
		spin_lock(&init_mm.page_table_lock);
		pmd_clear(pmd);
		spin_unlock(&init_mm.page_table_lock);
		}
		}
		continue;
		}

		/* Anything present that is not a section must be a table entry */
		BUG_ON(!pmd_table(*pmd));

		pte = pte_offset_map(pmd, addr);
		remove_pte_table(pte, addr, next, direct);
		free_pte_table(pmd, direct);
		}
		}

		/*
		 * Walk the PUD entries covering [addr, end), starting at @pud.
		 *
		 * Section mappings are handled here: they are cleared directly, or, for
		 * partial ranges, filled with PAGE_INUSE and released once the whole
		 * section consists of PAGE_INUSE bytes. Table entries are descended into
		 * with remove_pmd_table(), after which the PMD table is freed if it has
		 * become empty.
		 */
		static void remove_pud_table(pud_t *pud, unsigned long addr,
		unsigned long end, bool direct)
		{
		unsigned long next;
		pmd_t *pmd;
		void *page_addr;

		for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!pud_present(*pud))
		continue;
		/*
		 * If we are using 4K granules, check if we are using
		 * 1GB section mapping.
		 */
		if (pud_sect(*pud)) {
		/*
		 * NOTE(review): the whole section is dropped when the range is
		 * merely page-aligned; confirm callers never pass a range that
		 * covers only part of a section.
		 */
		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
		/* Backing pages are freed only for non-direct mappings */
		if (!direct) {
		free_pagetable(pud_page(*pud),
		get_order(PUD_SIZE), direct);
		}

		/*
		 * The original author notes that, on arm64, this lock
		 * seems to be taken only in __pte_alloc_kernel()
		 * (mm/memory.c) and that it is unclear whether this
		 * path can race with it. It is kept to be safe, but
		 * can probably be removed.
		 */
		spin_lock(&init_mm.page_table_lock);
		pud_clear(pud);
		spin_unlock(&init_mm.page_table_lock);
		} else {
		/* If here, we are freeing vmemmap pages. */
		memset((void *)addr, PAGE_INUSE, next - addr);

		/* Release the section once all of it is marked PAGE_INUSE */
		page_addr = page_address(pud_page(*pud));
		if (!memchr_inv(page_addr, PAGE_INUSE,
		PUD_SIZE)) {

		free_pagetable(pud_page(*pud),
		get_order(PUD_SIZE), direct);

		/* Same locking caveat as in the aligned branch above. */
		spin_lock(&init_mm.page_table_lock);
		pud_clear(pud);
		spin_unlock(&init_mm.page_table_lock);
		}
		}
		continue;
		}

		/* Anything present that is not a section must be a table entry */
		BUG_ON(!pud_table(*pud));

		pmd = pmd_offset(pud, addr);
		remove_pmd_table(pmd, addr, next, direct);
		free_pmd_table(pud, direct);
		}
		}

		/*
		 * Tear down the kernel page tables that map the virtual range
		 * [start, end), one PGD entry at a time, then flush the TLB.
		 *
		 * @direct is forwarded unchanged to the per-level helpers.
		 */
		void remove_pagetable(unsigned long start, unsigned long end, bool direct)
		{
			unsigned long addr = start;

			while (addr < end) {
				unsigned long boundary = pgd_addr_end(addr, end);
				pgd_t *pgd = pgd_offset_k(addr);

				if (!pgd_none(*pgd)) {
					remove_pud_table(pud_offset(pgd, addr), addr,
							 boundary, direct);
					/*
					 * With the PUD folded into the PGD (three levels
					 * of paging) free_pmd_table() has already freed
					 * the PMD page and reset the PGD == PUD entry, so
					 * a separate PUD table only exists with more
					 * than three levels.
					 */
		#if CONFIG_PGTABLE_LEVELS > 3
					free_pud_table(pgd, direct);
		#endif
				}

				addr = boundary;
			}

			flush_tlb_all();
		}


		#endif /* CONFIG_MEMORY_HOTREMOVE */
		#endif /* CONFIG_MEMORY_HOTPLUG */

		/*
		* Check whether a kernel address is valid (derived from arch/x86/).
		*/
		@@ -686,6 +1103,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
		#endif /* CONFIG_ARM64_64K_PAGES */
		/*
		 * Unmap the vmemmap range [start, end). The body is compiled only when
		 * CONFIG_MEMORY_HOTREMOVE is set; otherwise this is an empty function.
		 */
		void vmemmap_free(unsigned long start, unsigned long end)
		{
		#ifdef CONFIG_MEMORY_HOTREMOVE
		/* direct == false: this range is vmemmap, not the linear mapping */
		remove_pagetable(start, end, false);
		#endif
		}
		#endif /* CONFIG_SPARSEMEM_VMEMMAP */