Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 402b0862 authored by Carsten Otte, committed by Avi Kivity
Browse files

s390: KVM preparation: provide hook to enable pgstes in user pagetable



The SIE instruction on s390 uses the 2nd half of the page table page to
virtualize the storage keys of a guest. This patch offers the s390_enable_sie
function, which reorganizes the page tables of a single-threaded process to
reserve space in the page table:
s390_enable_sie makes sure that the process is single threaded and then uses
dup_mm to create a new mm with reorganized page tables. The old mm is freed
and the process now has a page status extended field after every page table.

Code that wants to exploit pgstes should select CONFIG_PGSTE.

This patch has a small common code hit, namely making dup_mm non-static.

Edit (Carsten): I've modified Martin's patch, following Jeremy Fitzhardinge's
review feedback. Now we do have the prototype for dup_mm in
include/linux/sched.h. Following Martin's suggestion, s390_enable_sie() does now
call task_lock() to prevent race against ptrace modification of mm_users.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>
parent 37817f29
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -62,6 +62,10 @@ config GENERIC_LOCKBREAK
	default y
	depends on SMP && PREEMPT

# Reserve space for page status extensions (pgstes) in every page
# table page; the SIE instruction uses them to virtualize guest
# storage keys. Selected automatically when KVM is enabled.
config PGSTE
	bool
	default y if KVM

mainmenu "Linux Kernel Configuration"

config S390
+4 −0
Original line number Diff line number Diff line
@@ -316,7 +316,11 @@ static int __init early_parse_ipldelay(char *p)
early_param("ipldelay", early_parse_ipldelay);

#ifdef CONFIG_S390_SWITCH_AMODE
/*
 * Default the switched address mode on when pgste support is built in,
 * off otherwise.
 * NOTE(review): presumably SIE/pgste operation requires running user
 * space in the switched address mode — confirm against the
 * CONFIG_S390_SWITCH_AMODE users.
 */
#ifdef CONFIG_PGSTE
unsigned int switch_amode = 1;
#else
unsigned int switch_amode = 0;
#endif
EXPORT_SYMBOL_GPL(switch_amode);

static void set_amode_and_uaccess(unsigned long user_amode,
+62 −3
Original line number Diff line number Diff line
@@ -30,11 +30,27 @@
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

/*
 * Initialize a page-table page for a pgste-enabled mm (64-bit layout):
 * the two quarter-page pte areas are filled with empty pte entries and
 * the quarter-page region following each one is zeroed.
 * NOTE(review): the zeroed regions appear to be the page status
 * extension (pgste) areas used by SIE — confirm against the s390
 * page-table layout for this configuration (TABLES_PER_PAGE == 4).
 */
void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

/*
 * Initialize a page-table page for a pgste-enabled mm (31-bit layout):
 * the first half of the page is filled with empty pte entries and the
 * second half is zeroed.
 * NOTE(review): the zeroed half appears to be the page status
 * extension (pgste) area used by SIE — confirm against the s390
 * page-table layout for this configuration (TABLES_PER_PAGE == 2).
 */
void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
@@ -153,7 +169,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
	unsigned long *table;
	unsigned long bits;

	bits = mm->context.noexec ? 3UL : 1UL;
	bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
	spin_lock(&mm->page_table_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
@@ -170,6 +186,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.pgstes)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock(&mm->page_table_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
@@ -191,7 +210,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
	struct page *page;
	unsigned long bits;

	bits = mm->context.noexec ? 3UL : 1UL;
	bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock(&mm->page_table_lock);
@@ -228,3 +247,43 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
	mm->context.noexec = 0;
	update_mm(mm, tsk);
}

/*
 * switch on pgstes for its userspace process (for kvm)
 *
 * Rebuild the calling process' page tables so that every page table
 * carries a page status extension (pgste) area, as required by the SIE
 * instruction.  This only works while the process is single threaded:
 * the mm must not be shared (mm_users == 1), must be the active mm,
 * and must not be referenced by in-flight async I/O (ioctx_list).
 *
 * Returns 0 on success (or if pgstes are already enabled), -EINVAL if
 * the mm cannot be converted, -ENOMEM if duplicating the mm fails.
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm;
	int rc;

	/*
	 * task_lock() protects tsk->mm against concurrent modification,
	 * e.g. ptrace raising mm_users underneath us.
	 */
	task_lock(tsk);

	/*
	 * Check tsk->mm for NULL before any dereference (kernel threads
	 * have no mm).  The original code read tsk->mm->context.pgstes
	 * first, which made the NULL check below it unreachable in the
	 * only case it mattered.
	 */
	rc = -EINVAL;
	if (!tsk->mm)
		goto unlock;

	/* Nothing to do if pgstes are already enabled for this mm. */
	rc = 0;
	if (tsk->mm->context.pgstes)
		goto unlock;

	/* The mm must be private to this single-threaded task. */
	rc = -EINVAL;
	if (atomic_read(&tsk->mm->mm_users) > 1 ||
	    tsk->mm != tsk->active_mm || tsk->mm->ioctx_list)
		goto unlock;

	/*
	 * Temporarily flag the old mm so that dup_mm() -> init_new_context()
	 * allocates the copy's page tables with pgste space; the flag on
	 * the old mm is cleared again right after ("dirty little tricks").
	 */
	tsk->mm->context.pgstes = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.pgstes = 0;

	rc = -ENOMEM;
	if (!mm)
		goto unlock;

	/* Drop the old mm and switch this task onto the reorganized one. */
	mmput(tsk->mm);
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	cpu_set(smp_processor_id(), mm->cpu_vm_mask);
	preempt_enable();
	rc = 0;
unlock:
	task_unlock(tsk);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
+1 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ typedef struct {
	unsigned long asce_bits;
	unsigned long asce_limit;
	int noexec;
	int pgstes;
} mm_context_t;

#endif
+7 −1
Original line number Diff line number Diff line
@@ -20,7 +20,13 @@ static inline int init_new_context(struct task_struct *tsk,
#ifdef CONFIG_64BIT
	mm->context.asce_bits |= _ASCE_TYPE_REGION3;
#endif
	if (current->mm->context.pgstes) {
		mm->context.noexec = 0;
		mm->context.pgstes = 1;
	} else {
		mm->context.noexec = s390_noexec;
		mm->context.pgstes = 0;
	}
	mm->context.asce_limit = STACK_TOP_MAX;
	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
	return 0;
Loading