Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c09230f3 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull power management fixes from Rafael Wysocki:
 "One fix for a recent cpuidle core change that, against all odds,
  introduced a functional regression on Power systems and the fix for
  the crash during resume from hibernation on x86-64 that has been in
  the works for the last few weeks (it actually was ready last week, but
  I wanted to allow the reporters to test it for some more time).

  Specifics:

   - Fix a recent performance regression on Power systems (powernv and
     pseries) introduced by a core cpuidle commit that decreased the
     precision of the last_residency conversion from nano- to
     microseconds, which should not matter in theory, but turned out to
     play not-so-well with the special "snooze" idle state on Power
     (Shreyas B Prabhu).

   - Fix a crash during resume from hibernation on x86-64 caused by
     possible corruption of the kernel text part of page tables in the
     last phase of image restoration exposed by a security-related
     change during the 4.3 development cycle (Rafael Wysocki)"

* tag 'pm-4.7-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  cpuidle: Fix last_residency division
  x86/power/64: Fix kernel text mapping corruption during image restoration
parents ac904ae6 7fe39a21
Loading
Loading
Loading
Loading
+85 −12
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#include <asm/mtrr.h>
#include <asm/sections.h>
#include <asm/suspend.h>
#include <asm/tlbflush.h>

/* Defined in hibernate_asm_64.S */
extern asmlinkage __visible int restore_image(void);
@@ -28,6 +29,7 @@ extern asmlinkage __visible int restore_image(void);
 * kernel's text (this value is passed in the image header).
 */
unsigned long restore_jump_address __visible;
unsigned long jump_address_phys;

/*
 * Value of the cr3 register from before the hibernation (this value is passed
@@ -37,7 +39,43 @@ unsigned long restore_cr3 __visible;

pgd_t *temp_level4_pgt __visible;

void *relocated_restore_code __visible;
unsigned long relocated_restore_code __visible;

static int set_up_temporary_text_mapping(void)
{
	pmd_t *pmd;
	pud_t *pud;

	/*
	 * The new mapping only has to cover the page containing the image
	 * kernel's entry point (jump_address_phys), because the switch over to
	 * it is carried out by relocated code running from a page allocated
	 * specifically for this purpose and covered by the identity mapping, so
	 * the temporary kernel text mapping is only needed for the final jump.
	 * Moreover, in that mapping the virtual address of the image kernel's
	 * entry point must be the same as its virtual address in the image
	 * kernel (restore_jump_address), so the image kernel's
	 * restore_registers() code doesn't find itself in a different area of
	 * the virtual address space after switching over to the original page
	 * tables used by the image kernel.
	 */
	pud = (pud_t *)get_safe_page(GFP_ATOMIC);
	if (!pud)
		return -ENOMEM;

	pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
	if (!pmd)
		return -ENOMEM;

	set_pmd(pmd + pmd_index(restore_jump_address),
		__pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
	set_pud(pud + pud_index(restore_jump_address),
		__pud(__pa(pmd) | _KERNPG_TABLE));
	set_pgd(temp_level4_pgt + pgd_index(restore_jump_address),
		__pgd(__pa(pud) | _KERNPG_TABLE));

	return 0;
}

static void *alloc_pgt_page(void *context)
{
@@ -59,9 +97,10 @@ static int set_up_temporary_mappings(void)
	if (!temp_level4_pgt)
		return -ENOMEM;

	/* It is safe to reuse the original kernel mapping */
	set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
		init_level4_pgt[pgd_index(__START_KERNEL_map)]);
	/* Prepare a temporary mapping for the kernel text */
	result = set_up_temporary_text_mapping();
	if (result)
		return result;

	/* Set up the direct mapping from scratch */
	for (i = 0; i < nr_pfn_mapped; i++) {
@@ -78,19 +117,50 @@ static int set_up_temporary_mappings(void)
	return 0;
}

/*
 * Copy the core restore code to a freshly allocated page and make that page
 * executable.
 *
 * The page is obtained with get_safe_page() and its address is stored in
 * relocated_restore_code.  PAGE_SIZE bytes starting at core_restore_code are
 * copied into it.  The page tables referenced by the current CR3 value are
 * then walked to the entry mapping that page, and _PAGE_NX is cleared in it.
 *
 * Returns 0 on success or -ENOMEM if the page cannot be allocated.
 */
static int relocate_restore_code(void)
{
	pgd_t *pgd;
	pud_t *pud;

	relocated_restore_code = get_safe_page(GFP_ATOMIC);
	if (!relocated_restore_code)
		return -ENOMEM;

	memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);

	/* Make the page containing the relocated code executable */
	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
	pud = pud_offset(pgd, relocated_restore_code);
	/*
	 * The mapping may terminate at the PUD or PMD level (large page) or at
	 * the PTE level; clear NX in whichever entry actually maps the page.
	 */
	if (pud_large(*pud)) {
		set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
	} else {
		pmd_t *pmd = pmd_offset(pud, relocated_restore_code);

		if (pmd_large(*pmd)) {
			set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
		} else {
			pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code);

			set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
		}
	}
	/*
	 * NOTE(review): presumably flushes any cached translation that still
	 * has NX set for this page -- confirm.
	 */
	__flush_tlb_all();

	return 0;
}

int swsusp_arch_resume(void)
{
	int error;

	/* We have got enough memory and from now on we cannot recover */
	if ((error = set_up_temporary_mappings()))
	error = set_up_temporary_mappings();
	if (error)
		return error;

	relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC);
	if (!relocated_restore_code)
		return -ENOMEM;
	memcpy(relocated_restore_code, &core_restore_code,
	       &restore_registers - &core_restore_code);
	error = relocate_restore_code();
	if (error)
		return error;

	restore_image();
	return 0;
@@ -109,11 +179,12 @@ int pfn_is_nosave(unsigned long pfn)

/*
 * Architecture-specific record written by arch_hibernation_header_save() and
 * read back by arch_hibernation_header_restore().
 */
struct restore_data_record {
	/* Virtual address to jump to; loaded into restore_jump_address on restore */
	unsigned long jump_address;
	/* Physical address counterpart; loaded into jump_address_phys on restore */
	unsigned long jump_address_phys;
	/* Saved CR3 value; loaded into restore_cr3 on restore */
	unsigned long cr3;
	/* Must equal RESTORE_MAGIC, otherwise the restore path returns -EINVAL */
	unsigned long magic;
};

#define RESTORE_MAGIC	0x0123456789ABCDEFUL
#define RESTORE_MAGIC	0x123456789ABCDEF0UL

/**
 *	arch_hibernation_header_save - populate the architecture specific part
@@ -126,7 +197,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)

	if (max_size < sizeof(struct restore_data_record))
		return -EOVERFLOW;
	rdr->jump_address = restore_jump_address;
	rdr->jump_address = (unsigned long)&restore_registers;
	rdr->jump_address_phys = __pa_symbol(&restore_registers);
	rdr->cr3 = restore_cr3;
	rdr->magic = RESTORE_MAGIC;
	return 0;
@@ -142,6 +214,7 @@ int arch_hibernation_header_restore(void *addr)
	struct restore_data_record *rdr = addr;

	restore_jump_address = rdr->jump_address;
	jump_address_phys = rdr->jump_address_phys;
	restore_cr3 = rdr->cr3;
	return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
}
+24 −31
Original line number Diff line number Diff line
@@ -44,9 +44,6 @@ ENTRY(swsusp_arch_suspend)
	pushfq
	popq	pt_regs_flags(%rax)

	/* save the address of restore_registers */
	movq	$restore_registers, %rax
	movq	%rax, restore_jump_address(%rip)
	/* save cr3 */
	movq	%cr3, %rax
	movq	%rax, restore_cr3(%rip)
@@ -57,31 +54,34 @@ ENTRY(swsusp_arch_suspend)
ENDPROC(swsusp_arch_suspend)

ENTRY(restore_image)
	/* switch to temporary page tables */
	movq	$__PAGE_OFFSET, %rdx
	movq	temp_level4_pgt(%rip), %rax
	subq	%rdx, %rax
	movq	%rax, %cr3
	/* Flush TLB */
	movq	mmu_cr4_features(%rip), %rax
	movq	%rax, %rdx
	andq	$~(X86_CR4_PGE), %rdx
	movq	%rdx, %cr4;  # turn off PGE
	movq	%cr3, %rcx;  # flush TLB
	movq	%rcx, %cr3;
	movq	%rax, %cr4;  # turn PGE back on

	/* prepare to jump to the image kernel */
	movq	restore_jump_address(%rip), %rax
	movq	restore_cr3(%rip), %rbx
	movq	restore_jump_address(%rip), %r8
	movq	restore_cr3(%rip), %r9

	/* prepare to switch to temporary page tables */
	movq	temp_level4_pgt(%rip), %rax
	movq	mmu_cr4_features(%rip), %rbx

	/* prepare to copy image data to their original locations */
	movq	restore_pblist(%rip), %rdx

	/* jump to relocated restore code */
	movq	relocated_restore_code(%rip), %rcx
	jmpq	*%rcx

	/* code below has been relocated to a safe page */
ENTRY(core_restore_code)
	/* switch to temporary page tables */
	movq	$__PAGE_OFFSET, %rcx
	subq	%rcx, %rax
	movq	%rax, %cr3
	/* flush TLB */
	movq	%rbx, %rcx
	andq	$~(X86_CR4_PGE), %rcx
	movq	%rcx, %cr4;  # turn off PGE
	movq	%cr3, %rcx;  # flush TLB
	movq	%rcx, %cr3;
	movq	%rbx, %cr4;  # turn PGE back on
.Lloop:
	testq	%rdx, %rdx
	jz	.Ldone
@@ -96,24 +96,17 @@ ENTRY(core_restore_code)
	/* progress to the next pbe */
	movq	pbe_next(%rdx), %rdx
	jmp	.Lloop

.Ldone:
	/* jump to the restore_registers address from the image header */
	jmpq	*%rax
	/*
	 * NOTE: This assumes that the boot kernel's text mapping covers the
	 * image kernel's page containing restore_registers and the address of
	 * this page is the same as in the image kernel's text mapping (it
	 * should always be true, because the text mapping is linear, starting
	 * from 0, and is supposed to cover the entire kernel text for every
	 * kernel).
	 *
	 * code below belongs to the image kernel
	 */
	jmpq	*%r8

	 /* code below belongs to the image kernel */
	.align PAGE_SIZE
ENTRY(restore_registers)
	FRAME_BEGIN
	/* go back to the original page tables */
	movq    %rbx, %cr3
	movq    %r9, %cr3

	/* Flush TLB, including "global" things (vmalloc) */
	movq	mmu_cr4_features(%rip), %rax
+4 −8
Original line number Diff line number Diff line
@@ -173,7 +173,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,

	struct cpuidle_state *target_state = &drv->states[index];
	bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP);
	u64 time_start, time_end;
	ktime_t time_start, time_end;
	s64 diff;

	/*
@@ -195,13 +195,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
	sched_idle_set_state(target_state);

	trace_cpu_idle_rcuidle(index, dev->cpu);
	time_start = local_clock();
	time_start = ns_to_ktime(local_clock());

	stop_critical_timings();
	entered_state = target_state->enter(dev, drv, index);
	start_critical_timings();

	time_end = local_clock();
	time_end = ns_to_ktime(local_clock());
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);

	/* The cpu is no longer idle or about to enter idle. */
@@ -217,11 +217,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
	if (!cpuidle_state_is_coupled(drv, index))
		local_irq_enable();

	/*
	 * local_clock() returns the time in nanosecond, let's shift
	 * by 10 (divide by 1024) to have microsecond based time.
	 */
	diff = (time_end - time_start) >> 10;
	diff = ktime_us_delta(time_end, time_start);
	if (diff > INT_MAX)
		diff = INT_MAX;