Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5cac878f authored by Linux Build Service Account's avatar Linux Build Service Account Committed by Gerrit - the friendly Code Review server
Browse files

Merge "arm64: Hot-remove implementation for arm64"

parents aa035609 efdbaef3
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -653,6 +653,7 @@ config HOTPLUG_CPU
	  can be controlled through /sys/devices/system/cpu.

# Memory hotplug defaults to enabled, but only on configurations without NUMA.
config ARCH_ENABLE_MEMORY_HOTPLUG
    depends on !NUMA
	def_bool y

config ARCH_ENABLE_MEMORY_HOTREMOVE
+7 −0
Original line number Diff line number Diff line
@@ -35,5 +35,12 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
			       unsigned long virt, phys_addr_t size,
			       pgprot_t prot);
extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Build kernel page tables for hot-added memory: @start is the physical
 * base of the range and @size its length.
 */
extern void hotplug_paging(phys_addr_t start, phys_addr_t size);
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Tear down the kernel page tables covering the virtual range
 * [@start, @end). @direct is true when the range belongs to the direct
 * (linear) mapping and false when it belongs to vmemmap.
 */
extern void remove_pagetable(unsigned long start,
	unsigned long end, bool direct);
#endif
#endif

#endif
+15 −0
Original line number Diff line number Diff line
@@ -461,6 +461,11 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
	return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK;
}

/* Kernel virtual address of the page referenced by a PMD entry. */
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
	phys_addr_t table_phys = pmd_page_paddr(pmd);

	return (unsigned long)__va(table_phys);
}

/* Find an entry in the third-level page table. */
#define pte_index(addr)		(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))

@@ -512,6 +517,11 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
	return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK;
}

/* Kernel virtual address of the page referenced by a PUD entry. */
static inline unsigned long pud_page_vaddr(pud_t pud)
{
	phys_addr_t table_phys = pud_page_paddr(pud);

	return (unsigned long)__va(table_phys);
}

/* Find an entry in the second-level page table. */
#define pmd_index(addr)		(((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))

@@ -564,6 +574,11 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
	return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK;
}

/* Kernel virtual address of the page referenced by a PGD entry. */
static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
	phys_addr_t table_phys = pgd_page_paddr(pgd);

	return (unsigned long)__va(table_phys);
}

/* Find an entry in the first-level page table. */
#define pud_index(addr)		(((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))

+78 −7
Original line number Diff line number Diff line
@@ -503,14 +503,74 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
	struct zone *zone;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
	int ret;

	if (end_pfn > max_sparsemem_pfn) {
		pr_err("end_pfn too big");
		return -1;
	}
	hotplug_paging(start, size);

	/*
	 * Mark the first page in the range as unusable. This is needed
	 * because __add_section (within __add_pages) wants pfn_valid
	 * of it to be false, and in arm64 pfn_valid is implemented by
	 * just checking the nomap flag for existing blocks.
	 *
	 * A small trick here is that __add_section() requires only
	 * phys_start_pfn (that is the first pfn of a section) to be
	 * invalid. Regardless of whether it was assumed (by the function
	 * author) that all pfns within a section are either all valid
	 * or all invalid, it allows to avoid looping twice (once here,
	 * second when memblock_clear_nomap() is called) through all
	 * pfns of the section and modify only one pfn. Thanks to that,
	 * further, in __add_zone() only this very first pfn is skipped
	 * and corresponding page is not flagged reserved. Therefore it
	 * is enough to correct this setup only for it.
	 *
	 * When arch_add_memory() returns the walk_memory_range() function
	 * is called and passed with online_memory_block() callback,
	 * which execution finally reaches the memory_block_action()
	 * function, where also only the first pfn of a memory block is
	 * checked to be reserved. Above, it was first pfn of a section,
	 * here it is a block but
	 * (drivers/base/memory.c):
	 *     sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
	 * (include/linux/memory.h):
	 *     #define MIN_MEMORY_BLOCK_SIZE     (1UL << SECTION_SIZE_BITS)
	 * so we can consider block and section equivalently
	 */
	memblock_mark_nomap(start, 1<<PAGE_SHIFT);

	pgdat = NODE_DATA(nid);

	zone = pgdat->node_zones +
		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
	ret = __add_pages(nid, zone, start_pfn, nr_pages);

	/*
	 * Make the pages usable after they have been added.
	 * This will make pfn_valid return true
	 */
	memblock_clear_nomap(start, 1<<PAGE_SHIFT);

	/*
	 * This is a hack to avoid having to mix arch specific code
	 * into arch independent code. SetPageReserved is supposed
	 * to be called by __add_zone (within __add_section, within
	 * __add_pages). However, when it is called there, it assumes that
	 * pfn_valid returns true.  For the way pfn_valid is implemented
	 * in arm64 (a check on the nomap flag), the only way to make
	 * this evaluate true inside __add_zone is to clear the nomap
	 * flags of blocks in architecture independent code.
	 *
	 * To avoid this, we set the Reserved flag here after we cleared
	 * the nomap flag in the line above.
	 */
	SetPageReserved(pfn_to_page(start_pfn));

	if (ret)
		pr_warn("%s: Problem encountered in __add_pages() ret=%d\n",
			__func__, ret);
@@ -519,21 +579,32 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Drop the direct-mapping page tables for the physical range
 * [@start, @end): both bounds are translated with __va() and the
 * resulting virtual range is handed to remove_pagetable() with
 * direct == true.
 */
static void kernel_physical_mapping_remove(unsigned long start,
	unsigned long end)
{
	unsigned long virt_start = (unsigned long)__va(start);
	unsigned long virt_end = (unsigned long)__va(end);

	remove_pagetable(virt_start, virt_end, true);
}

/*
 * arch_remove_memory() - hot-remove the physical range [start, start + size).
 * @start: physical base address of the range
 * @size:  length of the range in bytes
 *
 * Removes the pages from their zone via __remove_pages() and then tears
 * down the direct mapping that covered them.
 *
 * Return: the value returned by __remove_pages() (0 on success).
 */
int arch_remove_memory(u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(start_pfn);
	struct zone *zone;
	int ret;

	zone = page_zone(page);
	ret = __remove_pages(zone, start_pfn, nr_pages);
	if (ret)
		pr_warn("%s: Problem encountered in __remove_pages() ret=%d\n",
			__func__, ret);
	WARN_ON_ONCE(ret);

	/* The linear mapping is dropped regardless of the result above. */
	kernel_physical_mapping_remove(start, start + size);

	return ret;
}
#endif
#endif

#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_MEMORY_HOTPLUG */
+420 −0
Original line number Diff line number Diff line
@@ -605,6 +605,423 @@ void __init paging_init(void)
	bootmem_init();
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Allocate one page with PGALLOC_GFP for use as a page table, run the
 * page-table constructor on it and return its physical address.
 * Either step failing is fatal (BUG()).
 */
static phys_addr_t pgd_pgtable_alloc(void)
{
	void *table = (void *)__get_free_page(PGALLOC_GFP);

	if (!table)
		BUG();
	if (!pgtable_page_ctor(virt_to_page(table)))
		BUG();

	/* Ensure the zeroed page is visible to the page table walker */
	dsb(ishst);

	return __pa(table);
}

/*
 * hotplug_paging() is used by memory hotplug to build new page tables
 * for hot added memory.
 *
 * @start: physical base address of the hot-added range
 * @size:  length of the range
 *
 * The range is mapped at __phys_to_virt(start) with PAGE_KERNEL
 * protections. The mapping is first created in a scratch copy of
 * swapper_pg_dir; the scratch copy is then installed with
 * cpu_replace_ttbr1() while swapper_pg_dir is overwritten, after which
 * swapper_pg_dir is installed again and the scratch page is freed.
 */
void hotplug_paging(phys_addr_t start, phys_addr_t size)
{

	struct page *pg;
	/* Scratch pgd page, accessed through the fixmap while it is built. */
	phys_addr_t pgd_phys = pgd_pgtable_alloc();
	pgd_t *pgd = pgd_set_fixmap(pgd_phys);

	/* Start from the current kernel tables ... */
	memcpy(pgd, swapper_pg_dir, PAGE_SIZE);

	/* ... and add the new range; lower-level tables come from pgd_pgtable_alloc. */
	__create_pgd_mapping(pgd, start, __phys_to_virt(start), size,
		PAGE_KERNEL, pgd_pgtable_alloc);

	/* Switch to the scratch copy while swapper_pg_dir is being rewritten. */
	cpu_replace_ttbr1(__va(pgd_phys));
	memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
	cpu_replace_ttbr1(swapper_pg_dir);

	pgd_clear_fixmap();

	/* The scratch pgd page is no longer referenced; undo the ctor and free it. */
	pg = phys_to_page(pgd_phys);
	pgtable_page_dtor(pg);
	__free_pages(pg, 0);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Fill byte written over the parts of a vmemmap page that are being
 * removed; once a whole page consists of this byte it can be freed.
 */
#define PAGE_INUSE 0xFD

/*
 * Give back the 2^@order pages starting at @page.
 *
 * Pages carrying the Reserved flag are treated as bootmem pages: the
 * flag is cleared on @page and every page is released with
 * put_page_bootmem() or free_reserved_page(), depending on the magic
 * stored in page->lru.next. Any other page goes back through
 * free_pages(), after pgtable_page_dtor() when @direct is set.
 */
static void free_pagetable(struct page *page, int order, bool direct)
{
	unsigned int remaining = 1 << order;
	unsigned long magic;

	if (!PageReserved(page)) {
		/*
		 * Only direct pagetable allocation (those allocated via
		 * hotplug) call the pgtable_page_ctor; vmemmap pgtable
		 * allocations don't.
		 */
		if (direct)
			pgtable_page_dtor(page);

		free_pages((unsigned long)page_address(page), order);
		return;
	}

	/* bootmem page has reserved flag */
	__ClearPageReserved(page);

	magic = (unsigned long)page->lru.next;
	if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
		for (; remaining; remaining--)
			put_page_bootmem(page++);
	} else {
		for (; remaining; remaining--)
			free_reserved_page(page++);
	}
}

/*
 * Free the PTE table referenced by @pmd if every entry in it is empty.
 *
 * @pmd:    PMD entry pointing to the PTE table
 * @direct: passed through to free_pagetable()
 *
 * The PMD entry is cleared and the TLB invalidated BEFORE the table page
 * is returned to the allocator, so that the page is never reachable
 * through the page tables (or through a cached table walk) once it has
 * been freed.
 */
static void free_pte_table(pmd_t *pmd, bool direct)
{
	pte_t *pte_start;
	struct page *page;
	int i;

	pte_start = (pte_t *)pmd_page_vaddr(*pmd);
	/* Bail out as soon as one PTE in the table is still populated. */
	for (i = 0; i < PTRS_PER_PTE; i++) {
		if (!pte_none(pte_start[i]))
			return;
	}

	/* Look up the table page while the entry still references it. */
	page = pmd_page(*pmd);

	/*
	 * init_mm.page_table_lock is kept from the original code; it is
	 * unclear whether concurrent updates of this entry are possible.
	 */
	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);

	/* Drop any cached translation that still walks through the table. */
	flush_tlb_all();

	free_pagetable(page, 0, direct);
}

/*
 * Free the PMD table referenced by @pud if every entry in it is empty.
 *
 * @pud:    PUD entry pointing to the PMD table
 * @direct: passed through to free_pagetable()
 *
 * The PUD entry is cleared and the TLB invalidated BEFORE the table page
 * is returned to the allocator, so that the page is never reachable
 * through the page tables (or through a cached table walk) once it has
 * been freed.
 */
static void free_pmd_table(pud_t *pud, bool direct)
{
	pmd_t *pmd_start;
	struct page *page;
	int i;

	pmd_start = (pmd_t *)pud_page_vaddr(*pud);
	/* Bail out as soon as one PMD in the table is still populated. */
	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_start[i]))
			return;
	}

	/* Look up the table page while the entry still references it. */
	page = pud_page(*pud);

	/*
	 * init_mm.page_table_lock is kept from the original code; it is
	 * unclear whether concurrent updates of this entry are possible.
	 */
	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);

	/* Drop any cached translation that still walks through the table. */
	flush_tlb_all();

	free_pagetable(page, 0, direct);
}

/*
 * When the PUD is folded on the PGD (three levels of paging),
 * there's no need to free PUDs
 */
#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Free the PUD table referenced by @pgd if every entry in it is empty.
 *
 * @pgd:    PGD entry pointing to the PUD table
 * @direct: passed through to free_pagetable()
 *
 * The PGD entry is cleared and the TLB invalidated BEFORE the table page
 * is returned to the allocator, so that the page is never reachable
 * through the page tables (or through a cached table walk) once it has
 * been freed.
 */
static void free_pud_table(pgd_t *pgd, bool direct)
{
	pud_t *pud_start;
	struct page *page;
	int i;

	pud_start = (pud_t *)pgd_page_vaddr(*pgd);
	/* Bail out as soon as one PUD in the table is still populated. */
	for (i = 0; i < PTRS_PER_PUD; i++) {
		if (!pud_none(pud_start[i]))
			return;
	}

	/* Look up the table page while the entry still references it. */
	page = pgd_page(*pgd);

	/*
	 * init_mm.page_table_lock is kept from the original code; it is
	 * unclear whether concurrent updates of this entry are possible.
	 */
	spin_lock(&init_mm.page_table_lock);
	pgd_clear(pgd);
	spin_unlock(&init_mm.page_table_lock);

	/* Drop any cached translation that still walks through the table. */
	flush_tlb_all();

	free_pagetable(page, 0, direct);
}
#endif

/*
 * Walk the PTEs covering [@addr, @end), starting at @pte, and unmap them.
 *
 * A PTE whose page is entirely inside the range is cleared; its backing
 * page is freed as well unless @direct is set. For a partially covered
 * page, the covered bytes are overwritten with PAGE_INUSE and the page
 * is freed and unmapped only once it consists solely of PAGE_INUSE
 * bytes. The TLB is flushed once after the walk.
 */
static void remove_pte_table(pte_t *pte, unsigned long addr,
	unsigned long end, bool direct)
{
	unsigned long next;

	for (; addr < end; addr = next, pte++) {
		bool whole_page;

		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		whole_page = PAGE_ALIGNED(addr) && PAGE_ALIGNED(next);
		if (whole_page) {
			/*
			 * Direct mapping pages are not freed here: they were
			 * released when the memory was offlined, or were
			 * simply not in use.
			 */
			if (!direct)
				free_pagetable(pte_page(*pte), 0, direct);
		} else {
			void *backing;

			/*
			 * Only vmemmap can get here, since direct mapped
			 * ranges are aligned. Other struct pages in this
			 * page may still be live, so mark just our part
			 * with PAGE_INUSE and keep the page until all of it
			 * has been marked.
			 */
			memset((void *)addr, PAGE_INUSE, next - addr);

			backing = page_address(pte_page(*pte));
			if (memchr_inv(backing, PAGE_INUSE, PAGE_SIZE))
				continue;

			free_pagetable(pte_page(*pte), 0, direct);
		}

		/*
		 * The lock is kept from the original code; whether this
		 * path can actually run concurrently is unclear.
		 */
		spin_lock(&init_mm.page_table_lock);
		pte_clear(&init_mm, addr, pte);
		spin_unlock(&init_mm.page_table_lock);
	}

	/* Flush here, mirroring what the x86 implementation does. */
	flush_tlb_all();
}

/*
 * Walk the PMDs covering [@addr, @end), starting at @pmd, and unmap them.
 *
 * Table entries are handed to remove_pte_table(), after which the PTE
 * table is freed if it became empty. Section (block) entries are
 * cleared only when the range covers the whole PMD_SIZE block; a
 * partially covered block is filled with PAGE_INUSE over the covered
 * bytes and is freed and unmapped only once the whole block consists of
 * PAGE_INUSE bytes.
 *
 * The "whole block" test uses PMD_SIZE alignment: a merely page-aligned
 * sub-range must not tear down the entire section mapping.
 */
static void remove_pmd_table(pmd_t *pmd, unsigned long addr,
	unsigned long end, bool direct)
{
	unsigned long next;
	void *page_addr;
	pte_t *pte;

	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (!pmd_sect(*pmd)) {
			BUG_ON(!pmd_table(*pmd));

			pte = pte_offset_map(pmd, addr);
			remove_pte_table(pte, addr, next, direct);
			free_pte_table(pmd, direct);
			continue;
		}

		/* Section mapping: one entry maps PMD_SIZE bytes. */
		if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) {
			/* Direct-mapped memory itself is not freed here. */
			if (!direct)
				free_pagetable(pmd_page(*pmd),
					get_order(PMD_SIZE), direct);
		} else {
			/* If here, we are freeing vmemmap pages. */
			memset((void *)addr, PAGE_INUSE, next - addr);

			page_addr = page_address(pmd_page(*pmd));
			if (memchr_inv(page_addr, PAGE_INUSE, PMD_SIZE))
				continue;

			free_pagetable(pmd_page(*pmd),
				get_order(PMD_SIZE), direct);
		}

		/*
		 * The lock is kept from the original code; whether this
		 * path can actually run concurrently is unclear.
		 */
		spin_lock(&init_mm.page_table_lock);
		pmd_clear(pmd);
		spin_unlock(&init_mm.page_table_lock);
	}
}

/*
 * Walk the PUDs covering [@addr, @end), starting at @pud, and unmap them.
 *
 * Table entries are handed to remove_pmd_table(), after which the PMD
 * table is freed if it became empty. Section (block) entries are
 * cleared only when the range covers the whole PUD_SIZE block; a
 * partially covered block is filled with PAGE_INUSE over the covered
 * bytes and is freed and unmapped only once the whole block consists of
 * PAGE_INUSE bytes.
 *
 * The "whole block" test uses PUD_SIZE alignment: a merely page-aligned
 * sub-range must not tear down the entire section mapping.
 */
static void remove_pud_table(pud_t *pud, unsigned long addr,
	unsigned long end, bool direct)
{
	unsigned long next;
	pmd_t *pmd;
	void *page_addr;

	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (!pud_sect(*pud)) {
			BUG_ON(!pud_table(*pud));

			pmd = pmd_offset(pud, addr);
			remove_pmd_table(pmd, addr, next, direct);
			free_pmd_table(pud, direct);
			continue;
		}

		/* Section mapping: one entry maps PUD_SIZE bytes. */
		if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) {
			/* Direct-mapped memory itself is not freed here. */
			if (!direct)
				free_pagetable(pud_page(*pud),
					get_order(PUD_SIZE), direct);
		} else {
			/* If here, we are freeing vmemmap pages. */
			memset((void *)addr, PAGE_INUSE, next - addr);

			page_addr = page_address(pud_page(*pud));
			if (memchr_inv(page_addr, PAGE_INUSE, PUD_SIZE))
				continue;

			free_pagetable(pud_page(*pud),
				get_order(PUD_SIZE), direct);
		}

		/*
		 * The lock is kept from the original code; whether this
		 * path can actually run concurrently is unclear.
		 */
		spin_lock(&init_mm.page_table_lock);
		pud_clear(pud);
		spin_unlock(&init_mm.page_table_lock);
	}
}

/*
 * Remove the kernel page tables covering the virtual range
 * [@start, @end), one PGD entry at a time, then flush the TLB.
 * @direct is forwarded unchanged to the per-level helpers.
 */
void remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
	unsigned long addr = start;

	while (addr < end) {
		unsigned long next = pgd_addr_end(addr, end);
		pgd_t *pgd = pgd_offset_k(addr);

		if (!pgd_none(*pgd)) {
			pud_t *pud = pud_offset(pgd, addr);

			remove_pud_table(pud, addr, next, direct);
			/*
			 * With the PUD folded into the PGD (three levels of
			 * paging), free_pmd_table() has already released the
			 * PMD page and reset the PGD==PUD entry.
			 */
#if CONFIG_PGTABLE_LEVELS > 3
			free_pud_table(pgd, direct);
#endif
		}

		addr = next;
	}

	flush_tlb_all();
}


#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * Check whether a kernel address is valid (derived from arch/x86/).
 */
@@ -686,6 +1103,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
#endif	/* CONFIG_ARM64_64K_PAGES */
/*
 * Release the vmemmap range [start, end). With CONFIG_MEMORY_HOTREMOVE
 * the range is passed to remove_pagetable() as a non-direct mapping;
 * without it this function does nothing.
 */
void vmemmap_free(unsigned long start, unsigned long end)
{
#ifdef CONFIG_MEMORY_HOTREMOVE
	remove_pagetable(start, end, false);
#endif
}
#endif	/* CONFIG_SPARSEMEM_VMEMMAP */

Loading