
Commit 1d86b5cc authored by Avi Kivity

Merge branch 'queue' into next



* queue:
  KVM: MMU: Eliminate pointless temporary 'ac'
  KVM: MMU: Avoid access/dirty update loop if all is well
  KVM: MMU: Eliminate eperm temporary
  KVM: MMU: Optimize is_last_gpte()
  KVM: MMU: Simplify walk_addr_generic() loop
  KVM: MMU: Optimize pte permission checks
  KVM: MMU: Update accessed and dirty bits after guest pagetable walk
  KVM: MMU: Move gpte_access() out of paging_tmpl.h
  KVM: MMU: Optimize gpte_access() slightly
  KVM: MMU: Push clean gpte write protection out of gpte_access()
  KVM: clarify kvmclock documentation
  KVM: make processes waiting on vcpu mutex killable
  KVM: SVM: Make use of asm.h
  KVM: VMX: Make use of asm.h
  KVM: VMX: Make lto-friendly

Signed-off-by: Avi Kivity <avi@redhat.com>
parents ecba9a52 c5421519
+20 −12
@@ -34,9 +34,12 @@ MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00
 		time information and check that they are both equal and even.
 		An odd version indicates an in-progress update.
 
-		sec: number of seconds for wallclock.
+		sec: number of seconds for wallclock at time of boot.
 
-		nsec: number of nanoseconds for wallclock.
+		nsec: number of nanoseconds for wallclock at time of boot.
+
+	In order to get the current wallclock time, the system_time from
+	MSR_KVM_SYSTEM_TIME_NEW needs to be added.
 
 	Note that although MSRs are per-CPU entities, the effect of this
 	particular MSR is global.
@@ -82,20 +85,25 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01
 		time at the time this structure was last updated. Unit is
 		nanoseconds.
 
-		tsc_to_system_mul: a function of the tsc frequency. One has
-		to multiply any tsc-related quantity by this value to get
-		a value in nanoseconds, besides dividing by 2^tsc_shift
+		tsc_to_system_mul: multiplier to be used when converting
+		tsc-related quantity to nanoseconds
 
-		tsc_shift: cycle to nanosecond divider, as a power of two, to
-		allow for shift rights. One has to shift right any tsc-related
-		quantity by this value to get a value in nanoseconds, besides
-		multiplying by tsc_to_system_mul.
+		tsc_shift: shift to be used when converting tsc-related
+		quantity to nanoseconds. This shift will ensure that
+		multiplication with tsc_to_system_mul does not overflow.
+		A positive value denotes a left shift, a negative value
+		a right shift.
 
-		With this information, guests can derive per-CPU time by
-		doing:
+		The conversion from tsc to nanoseconds involves an additional
+		right shift by 32 bits. With this information, guests can
+		derive per-CPU time by doing:
 
 			time = (current_tsc - tsc_timestamp)
-			time = (time * tsc_to_system_mul) >> tsc_shift
+			if (tsc_shift >= 0)
+				time <<= tsc_shift;
+			else
+				time >>= -tsc_shift;
+			time = (time * tsc_to_system_mul) >> 32
+			time = time + system_time
 
 		flags: bits in this field indicate extended capabilities
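As a reading aid (not part of the patch): a minimal guest-side sketch of the read protocol the updated text describes. The field names follow the documentation above; the struct name and the rdtsc_sketch()/barrier() helpers are local stand-ins rather than kernel definitions, and the 128-bit multiply assumes GCC on x86-64.

#include <stdint.h>

/* Layout mirrors the pvclock structure documented above; padding and
 * field order here are illustrative, not authoritative. */
struct pvclock_sketch {
	uint32_t version;        /* odd while the host is updating */
	uint32_t pad0;
	uint64_t tsc_timestamp;  /* host TSC at last update */
	uint64_t system_time;    /* nanoseconds at last update */
	uint32_t tsc_to_system_mul;
	int8_t   tsc_shift;      /* >= 0: shift left; < 0: shift right */
	uint8_t  flags;
	uint8_t  pad[2];
};

static inline uint64_t rdtsc_sketch(void)
{
	uint32_t lo, hi;
	__asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

#define barrier() __asm__ volatile("" ::: "memory")

static uint64_t pvclock_read_ns(volatile struct pvclock_sketch *pv)
{
	uint32_t version;
	uint64_t time;

	do {
		version = pv->version;
		barrier();
		time = rdtsc_sketch() - pv->tsc_timestamp;
		if (pv->tsc_shift >= 0)
			time <<= pv->tsc_shift;
		else
			time >>= -pv->tsc_shift;
		/* 32.32 fixed point, exactly as in the pseudocode above */
		time = (uint64_t)(((unsigned __int128)time * pv->tsc_to_system_mul) >> 32);
		time += pv->system_time;
		barrier();
	} while ((version & 1) || version != pv->version);

	return time;
}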
+14 −0
@@ -287,10 +287,24 @@ struct kvm_mmu {
 	union kvm_mmu_page_role base_role;
 	bool direct_map;
 
+	/*
+	 * Bitmap; bit set = permission fault
+	 * Byte index: page fault error code [4:1]
+	 * Bit index: pte permissions in ACC_* format
+	 */
+	u8 permissions[16];
+
 	u64 *pae_root;
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
 
+	/*
+	 * Bitmap: bit set = last pte in walk
+	 * index[0:1]: level (zero-based)
+	 * index[2]: pte.ps
+	 */
+	u8 last_pte_bitmap;
+
 	bool nx;
 
 	u64 pdptrs[4]; /* pae */
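How these two fields get consulted is worth a standalone illustration (an editorial sketch, not from the patch): the page-fault error code selects the byte, the accumulated ACC_* permissions of the walked ptes select the bit. The PFERR_ and ACC_ numeric values below are the architectural ones; the zeroed table stands in for what update_permission_bitmask() computes in mmu.c further down.

#include <stdio.h>

#define PFERR_WRITE_MASK 2u   /* error code bit 1 */
#define PFERR_USER_MASK  4u   /* error code bit 2 */
#define ACC_WRITE_MASK   2u
#define ACC_USER_MASK    4u

int main(void)
{
	/* In the kernel this is mmu->permissions, filled by
	 * update_permission_bitmask(); zeroed here for illustration. */
	unsigned char permissions[16] = { 0 };

	unsigned pfec = PFERR_USER_MASK | PFERR_WRITE_MASK; /* user write: 0x6 */
	unsigned pte_access = ACC_USER_MASK | ACC_WRITE_MASK;

	/* byte index = error code bits [4:1], bit index = ACC_ permissions */
	unsigned fault = (permissions[pfec >> 1] >> pte_access) & 1;

	printf("byte %u, bit %u -> fault=%u\n", pfec >> 1, pte_access, fault);
	return 0;
}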
+91 −0
@@ -3408,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
+{
+	unsigned mask;
+
+	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+	mask = (unsigned)~ACC_WRITE_MASK;
+	/* Allow write access to dirty gptes */
+	mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
+	*access &= mask;
+}
+
 static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 			   int *nr_present)
 {
@@ -3425,6 +3437,25 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 	return false;
 }
 
+static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
+{
+	unsigned access;
+
+	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+	access &= ~(gpte >> PT64_NX_SHIFT);
+
+	return access;
+}
+
+static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+{
+	unsigned index;
+
+	index = level - 1;
+	index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
+	return mmu->last_pte_bitmap & (1 << index);
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
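Two bits of arithmetic in these helpers are worth spelling out (an editorial sketch, compilable in userspace; the constants are the architectural x86 values, defined locally): protect_clean_gpte() works because PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT = 5, so shifting the gpte right by 5 moves its dirty bit onto the writable bit; is_last_gpte() packs (level - 1) into index bits 1:0 and the PS bit into index bit 2.

#include <stdio.h>

/* Architectural x86 pte bit positions, spelled out locally. */
#define PT_WRITABLE_SHIFT  1
#define PT_DIRTY_SHIFT     6
#define PT_PAGE_SIZE_SHIFT 7
#define PT_WRITABLE_MASK   (1u << PT_WRITABLE_SHIFT)
#define PT_DIRTY_MASK      (1u << PT_DIRTY_SHIFT)
#define PT_PAGE_SIZE_MASK  (1u << PT_PAGE_SIZE_SHIFT)
#define ACC_WRITE_MASK     PT_WRITABLE_MASK

int main(void)
{
	unsigned gpte = PT_DIRTY_MASK | PT_PAGE_SIZE_MASK; /* dirty large pde */
	unsigned access = 7; /* ACC_ALL */

	/* protect_clean_gpte(): dirty bit 6 shifted down onto writable bit 1 */
	unsigned mask = ~ACC_WRITE_MASK;
	mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
	access &= mask;
	printf("access = %u (write kept: the gpte is dirty)\n", access);

	/* is_last_gpte(): index bits 1:0 = level - 1, bit 2 = pte.ps */
	unsigned level = 2;
	unsigned index = (level - 1)
		       | ((gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2));
	printf("index = %u (0b101: level 2, ps set)\n", index);
	return 0;
}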
@@ -3494,6 +3525,56 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	}
 }
 
+static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+	unsigned bit, byte, pfec;
+	u8 map;
+	bool fault, x, w, u, wf, uf, ff, smep;
+
+	smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+		pfec = byte << 1;
+		map = 0;
+		wf = pfec & PFERR_WRITE_MASK;
+		uf = pfec & PFERR_USER_MASK;
+		ff = pfec & PFERR_FETCH_MASK;
+		for (bit = 0; bit < 8; ++bit) {
+			x = bit & ACC_EXEC_MASK;
+			w = bit & ACC_WRITE_MASK;
+			u = bit & ACC_USER_MASK;
+
+			/* Not really needed: !nx will cause pte.nx to fault */
+			x |= !mmu->nx;
+			/* Allow supervisor writes if !cr0.wp */
+			w |= !is_write_protection(vcpu) && !uf;
+			/* Disallow supervisor fetches of user code if cr4.smep */
+			x &= !(smep && u && !uf);
+
+			fault = (ff && !x) || (uf && !u) || (wf && !w);
+			map |= fault << bit;
+		}
+		mmu->permissions[byte] = map;
+	}
+}
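A worked example for one table entry (a standalone sketch assuming cr0.wp=1, cr4.smep=0 and nx enabled, so none of the special cases above fire): for a user-mode write fault, every pte-access combination that lacks either the user or the writable bit is marked faulting.

#include <stdio.h>

#define PFERR_WRITE 2u
#define PFERR_USER  4u
#define PFERR_FETCH 16u

int main(void)
{
	unsigned pfec = PFERR_USER | PFERR_WRITE; /* user-mode write fault */
	unsigned bit, map = 0;

	for (bit = 0; bit < 8; ++bit) {
		unsigned x = bit & 1, w = bit & 2, u = bit & 4; /* ACC_EXEC/WRITE/USER */
		unsigned wf = pfec & PFERR_WRITE;
		unsigned uf = pfec & PFERR_USER;
		unsigned ff = pfec & PFERR_FETCH;
		unsigned fault = (ff && !x) || (uf && !u) || (wf && !w);

		map |= fault << bit;
	}
	/* prints permissions[3] = 0x3f: only user+write combinations succeed */
	printf("permissions[%u] = 0x%02x\n", pfec >> 1, map);
	return 0;
}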

+static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+	u8 map;
+	unsigned level, root_level = mmu->root_level;
+	const unsigned ps_set_index = 1 << 2;  /* bit 2 of index: ps */
+
+	if (root_level == PT32E_ROOT_LEVEL)
+		--root_level;
+	/* PT_PAGE_TABLE_LEVEL always terminates */
+	map = 1 | (1 << ps_set_index);
+	for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
+		if (level <= PT_PDPE_LEVEL
+		    && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
+			map |= 1 << (ps_set_index | (level - 1));
+	}
+	mmu->last_pte_bitmap = map;
+}
+
 static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 					struct kvm_mmu *context,
 					int level)
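For a 64-bit guest the resulting bitmap can be derived by hand (a hypothetical userspace harness with the constants inlined, not kernel code): level 1 always terminates, and the PS bit terminates the walk at levels 2 and 3 but not at level 4.

#include <stdio.h>

int main(void)
{
	unsigned level, root_level = 4;          /* 64-bit guest */
	const unsigned ps_set_index = 1 << 2;    /* bit 2 of index: ps */
	unsigned map = 1 | (1 << ps_set_index);  /* level 1 always terminates */

	for (level = 2; level <= root_level; ++level)
		if (level <= 3)                  /* PS valid at PDE and PDPE */
			map |= 1 << (ps_set_index | (level - 1));

	/* 0x71: a 4k pte, or PS set at level 2 (2MB) or level 3 (1GB) */
	printf("last_pte_bitmap = 0x%02x\n", map);
	return 0;
}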
@@ -3502,6 +3583,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->root_level = level;
 
 	reset_rsvds_bits_mask(vcpu, context);
+	update_permission_bitmask(vcpu, context);
+	update_last_pte_bitmap(vcpu, context);
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -3530,6 +3613,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->root_level = PT32_ROOT_LEVEL;
 
 	reset_rsvds_bits_mask(vcpu, context);
+	update_permission_bitmask(vcpu, context);
+	update_last_pte_bitmap(vcpu, context);
 
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
@@ -3590,6 +3675,9 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = paging32_gva_to_gpa;
 	}
 
+	update_permission_bitmask(vcpu, context);
+	update_last_pte_bitmap(vcpu, context);
+
 	return 0;
 }

@@ -3665,6 +3753,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 	}
 
+	update_permission_bitmask(vcpu, g_context);
+	update_last_pte_bitmap(vcpu, g_context);
+
 	return 0;
 }

+12 −13
@@ -18,8 +18,10 @@
 #define PT_PCD_MASK (1ULL << 4)
 #define PT_ACCESSED_SHIFT 5
 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
 #define PT64_NX_SHIFT 63
@@ -88,17 +90,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
-static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
-					   bool write_fault, bool user_fault,
-					   unsigned long pte)
+/*
+ * Will a fault with a given page-fault error code (pfec) cause a permission
+ * fault with the given access (in ACC_* format)?
+ */
+static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
+				    unsigned pfec)
 {
-	if (unlikely(write_fault && !is_writable_pte(pte)
-	      && (user_fault || is_write_protection(vcpu))))
-		return false;
-
-	if (unlikely(user_fault && !(pte & PT_USER_MASK)))
-		return false;
-
-	return true;
+	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 }
 
 #endif
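Tying it together (an editorial example reusing the 0x3f row derived in the mmu.c note above, not part of the patch): the old branch cascade collapses to one shift and mask.

#include <assert.h>

int main(void)
{
	/* permissions[(PFERR_USER_MASK | PFERR_WRITE_MASK) >> 1], as derived
	 * in the worked example after update_permission_bitmask() above */
	unsigned char row = 0x3f;

	assert(((row >> 6) & 1) == 0); /* pte_access = user|write: allowed */
	assert(((row >> 2) & 1) == 1); /* pte_access = write only: faults  */
	return 0;
}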
+81 −110
@@ -63,10 +63,12 @@
  */
 struct guest_walker {
 	int level;
+	unsigned max_level;
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-				   bool last)
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+					     struct kvm_mmu *mmu,
+					     struct guest_walker *walker,
+					     int write_fault)
 {
-	unsigned access;
-
-	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-	if (last && !is_dirty_gpte(gpte))
-		access &= ~ACC_WRITE_MASK;
-
-#if PTTYPE == 64
-	if (vcpu->arch.mmu.nx)
-		access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
-	return access;
-}
-
-static bool FNAME(is_last_gpte)(struct guest_walker *walker,
-				struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-				pt_element_t gpte)
-{
-	if (walker->level == PT_PAGE_TABLE_LEVEL)
-		return true;
-
-	if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
-	    (PTTYPE == 64 || is_pse(vcpu)))
-		return true;
-
-	if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
-	    (mmu->root_level == PT64_ROOT_LEVEL))
-		return true;
-
-	return false;
+	unsigned level, index;
+	pt_element_t pte, orig_pte;
+	pt_element_t __user *ptep_user;
+	gfn_t table_gfn;
+	int ret;
+
+	for (level = walker->max_level; level >= walker->level; --level) {
+		pte = orig_pte = walker->ptes[level - 1];
+		table_gfn = walker->table_gfn[level - 1];
+		ptep_user = walker->ptep_user[level - 1];
+		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+		if (!(pte & PT_ACCESSED_MASK)) {
+			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_ACCESSED_MASK;
+		}
+		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_DIRTY_MASK;
+		}
+		if (pte == orig_pte)
+			continue;
+
+		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+		if (ret)
+			return ret;
+
+		mark_page_dirty(vcpu->kvm, table_gfn);
+		walker->ptes[level] = pte;
+	}
+	return 0;
 }
 
 /*
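A note on the retry semantics here (a C11 userspace sketch with hypothetical names, not kernel code): cmpxchg_gpte() returns nonzero when the guest pte changed between the walk and the update, and walk_addr_generic() responds by restarting the whole walk.

#include <stdatomic.h>

/* Hypothetical stand-ins for the kernel's pte type and accessed bit. */
#define SK_ACCESSED (1ul << 5)

static int set_accessed_sketch(_Atomic unsigned long *ptep, unsigned long orig)
{
	unsigned long want = orig | SK_ACCESSED;

	if (want == orig)
		return 0;                /* already accessed: nothing to do */
	/* like cmpxchg_gpte(): nonzero means the pte changed under us,
	 * and the caller answers with a goto retry_walk */
	return !atomic_compare_exchange_strong(ptep, &orig, want);
}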
@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 				    gva_t addr, u32 access)
 {
+	int ret;
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
-	unsigned index, pt_access, uninitialized_var(pte_access);
+	unsigned index, pt_access, pte_access, accessed_dirty, shift;
 	gpa_t pte_gpa;
-	bool eperm, last_gpte;
 	int offset;
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault  = access & PFERR_USER_MASK;
 	const int fetch_fault = access & PFERR_FETCH_MASK;
 	u16 errcode = 0;
+	gpa_t real_gpa;
+	gfn_t gfn;
 
 	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
-	eperm = false;
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);

@@ -169,15 +175,21 @@ retry_walk:
 		--walker->level;
 	}
 #endif
+	walker->max_level = walker->level;
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
 	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
-	pt_access = ACC_ALL;
+	accessed_dirty = PT_ACCESSED_MASK;
+	pt_access = pte_access = ACC_ALL;
 	++walker->level;
 
-	for (;;) {
+	do {
 		gfn_t real_gfn;
 		unsigned long host_addr;
 
+		pt_access &= pte_access;
+		--walker->level;
+
 		index = PT_INDEX(addr, walker->level);
 
 		table_gfn = gpte_to_gfn(pte);
@@ -199,6 +211,7 @@ retry_walk:
 		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
 		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
 			goto error;
+		walker->ptep_user[walker->level - 1] = ptep_user;
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
@@ -211,92 +224,48 @@ retry_walk:
 			goto error;
 		}
 
-		if (!check_write_user_access(vcpu, write_fault, user_fault,
-					  pte))
-			eperm = true;
-
-#if PTTYPE == 64
-		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
-			eperm = true;
-#endif
+		accessed_dirty &= pte;
+		pte_access = pt_access & gpte_access(vcpu, pte);
 
-		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
-		if (last_gpte) {
-			pte_access = pt_access &
-				     FNAME(gpte_access)(vcpu, pte, true);
-			/* check if the kernel is fetching from user page */
-			if (unlikely(pte_access & PT_USER_MASK) &&
-			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-				if (fetch_fault && !user_fault)
-					eperm = true;
-		}
+		walker->ptes[walker->level - 1] = pte;
+	} while (!is_last_gpte(mmu, walker->level, pte));
 
-		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
-			int ret;
-			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
-						       sizeof(pte));
-			ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-						  pte, pte|PT_ACCESSED_MASK);
-			if (unlikely(ret < 0))
-				goto error;
-			else if (ret)
-				goto retry_walk;
-
-			mark_page_dirty(vcpu->kvm, table_gfn);
-			pte |= PT_ACCESSED_MASK;
-		}
-
-		walker->ptes[walker->level - 1] = pte;
-
-		if (last_gpte) {
-			int lvl = walker->level;
-			gpa_t real_gpa;
-			gfn_t gfn;
-			u32 ac;
-
-			gfn = gpte_to_gfn_lvl(pte, lvl);
-			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
-			if (PTTYPE == 32 &&
-			    walker->level == PT_DIRECTORY_LEVEL &&
-			    is_cpuid_PSE36())
-				gfn += pse36_gfn_delta(pte);
-
-			ac = write_fault | fetch_fault | user_fault;
-
-			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
-						      ac);
-			if (real_gpa == UNMAPPED_GVA)
-				return 0;
-
-			walker->gfn = real_gpa >> PAGE_SHIFT;
-
-			break;
-		}
-
-		pt_access &= FNAME(gpte_access)(vcpu, pte, false);
-		--walker->level;
-	}
-
-	if (unlikely(eperm)) {
+	if (unlikely(permission_fault(mmu, pte_access, access))) {
 		errcode |= PFERR_PRESENT_MASK;
 		goto error;
 	}
 
+	gfn = gpte_to_gfn_lvl(pte, walker->level);
+	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+		gfn += pse36_gfn_delta(pte);
+
+	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
+	if (real_gpa == UNMAPPED_GVA)
+		return 0;
+
+	walker->gfn = real_gpa >> PAGE_SHIFT;
+
+	if (!write_fault)
+		protect_clean_gpte(&pte_access, pte);
 
-	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
-		int ret;
+	/*
+	 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
+	 * place right.
+	 *
+	 * On a read fault, do nothing.
+	 */
+	shift = write_fault >> ilog2(PFERR_WRITE_MASK);
+	shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
+	accessed_dirty &= pte >> shift;
 
-		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-					  pte, pte|PT_DIRTY_MASK);
+	if (unlikely(!accessed_dirty)) {
+		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
 		if (unlikely(ret < 0))
 			goto error;
 		else if (ret)
 			goto retry_walk;
-
-		mark_page_dirty(vcpu->kvm, table_gfn);
-		pte |= PT_DIRTY_MASK;
-		walker->ptes[walker->level - 1] = pte;
 	}
 
 	walker->pt_access = pt_access;
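The shift trick above deserves a worked example (a standalone sketch with hypothetical names; PFERR_WRITE_MASK is bit 1, so write_fault >> ilog2(PFERR_WRITE_MASK) is 0 or 1, and PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT = 1): on a read the accessed bit is tested, on a write the dirty bit is tested, with no branch.

#include <stdio.h>

#define PT_ACCESSED_SHIFT 5
#define PT_DIRTY_SHIFT    6
#define PT_ACCESSED_MASK  (1u << PT_ACCESSED_SHIFT)
#define PFERR_WRITE_MASK  2u

/* Condensed to a single pte; in the kernel, accessed_dirty accumulates
 * the accessed bits of the whole chain before this final fold. */
static unsigned need_ad_update(unsigned pte, unsigned write_fault)
{
	unsigned accessed_dirty = PT_ACCESSED_MASK & pte;
	/* write_fault is 0 or PFERR_WRITE_MASK, so shift is 0 or 1 */
	unsigned shift = (write_fault >> 1) * (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);

	accessed_dirty &= pte >> shift;
	return !accessed_dirty; /* nonzero: A (and maybe D) bits need setting */
}

int main(void)
{
	unsigned pte = PT_ACCESSED_MASK; /* accessed but not dirty */

	printf("read fault:  update needed = %u\n", need_ad_update(pte, 0));
	printf("write fault: update needed = %u\n", need_ad_update(pte, PFERR_WRITE_MASK));
	return 0;
}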
@@ -368,7 +337,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		return;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
+	pte_access = sp->role.access & gpte_access(vcpu, gpte);
+	protect_clean_gpte(&pte_access, gpte);
 	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
 	if (mmu_invalid_pfn(pfn))
 		return;
@@ -441,8 +411,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 			continue;
 
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
-								  true);
+		pte_access = sp->role.access & gpte_access(vcpu, gpte);
+		protect_clean_gpte(&pte_access, gpte);
 		gfn = gpte_to_gfn(gpte);
 		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 				      pte_access & ACC_WRITE_MASK);
@@ -794,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)

 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
-		pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+		pte_access &= gpte_access(vcpu, gpte);
+		protect_clean_gpte(&pte_access, gpte);
 
 		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
 			continue;