
Commit 08b297bb authored by Greg Kroah-Hartman
Paolo writes:
  "KVM changes for 4.19-rc7

   x86 and PPC bugfixes, mostly introduced in 4.19-rc1."

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvm: nVMX: fix entry with pending interrupt if APICv is enabled
  KVM: VMX: hide flexpriority from guest when disabled at the module level
  KVM: VMX: check for existence of secondary exec controls before accessing
  KVM: PPC: Book3S HV: Avoid crash from THP collapse during radix page fault
  KVM: x86: fix L1TF's MMIO GFN calculation
  tools/kvm_stat: cut down decimal places in update interval dialog
  KVM: nVMX: Fix emulation of VM_ENTRY_LOAD_BNDCFGS
  KVM: x86: Do not use kvm_x86_ops->mpx_supported() directly
  KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled
  KVM: x86: never trap MSR_KERNEL_GS_BASE
parents 4fbeba43 cc906f07
+10 −0
@@ -646,6 +646,16 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
	 */
	local_irq_disable();
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!ptep) {
		local_irq_enable();
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}
	pte = *ptep;
	local_irq_enable();

+20 −4
@@ -249,6 +249,17 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 */
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts.  This mask covers the lower bits of the GFN.
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;


static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
@@ -357,9 +368,7 @@ static bool is_mmio_spte(u64 spte)

static gfn_t get_mmio_spte_gfn(u64 spte)
{
-	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
-		   shadow_nonpresent_or_rsvd_mask;
-	u64 gpa = spte & ~mask;
+	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
	       & shadow_nonpresent_or_rsvd_mask;
@@ -423,6 +432,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static void kvm_mmu_reset_all_pte_masks(void)
{
	u8 low_phys_bits;

	shadow_user_mask = 0;
	shadow_accessed_mask = 0;
	shadow_dirty_mask = 0;
@@ -437,12 +448,17 @@ static void kvm_mmu_reset_all_pte_masks(void)
	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
	 * assumed that the CPU is not vulnerable to L1TF.
	 */
	low_phys_bits = boot_cpu_data.x86_phys_bits;
	if (boot_cpu_data.x86_phys_bits <
-	    52 - shadow_nonpresent_or_rsvd_mask_len)
+	    52 - shadow_nonpresent_or_rsvd_mask_len) {
		shadow_nonpresent_or_rsvd_mask =
			rsvd_bits(boot_cpu_data.x86_phys_bits -
				  shadow_nonpresent_or_rsvd_mask_len,
				  boot_cpu_data.x86_phys_bits - 1);
		low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
	}
	shadow_nonpresent_or_rsvd_lower_gfn_mask =
		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}

static int is_cpuid_PSE36(void)
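For reference, the GFN split described in the comment above can be reproduced in a few lines of userspace C. This is a minimal sketch, not kernel code: the 46-bit physical-address width and the example GPA are made-up values, and PAGE_SHIFT/GENMASK_ULL are re-defined locally to stand in for the kernel macros.

/* Illustration of the L1TF GFN split/restore described above.
 * Standalone userspace sketch; values are illustrative, not from KVM. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	const uint64_t rsvd_len = 5;	/* shadow_nonpresent_or_rsvd_mask_len */
	uint64_t phys_bits = 46;	/* stand-in for boot_cpu_data.x86_phys_bits */
	uint64_t low_phys_bits = phys_bits;
	uint64_t rsvd_mask = 0, lower_gfn_mask;

	if (phys_bits < 52 - rsvd_len) {
		/* rsvd_bits(phys_bits - rsvd_len, phys_bits - 1) */
		rsvd_mask = GENMASK_ULL(phys_bits - 1, phys_bits - rsvd_len);
		low_phys_bits -= rsvd_len;
	}
	lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);

	/* Storing: GFN bits that collide with rsvd_mask are shifted left by
	 * rsvd_len into the reserved region; the rest stay in place. */
	uint64_t gpa  = 0x123456789000ULL;	/* made-up guest physical address */
	uint64_t spte = (gpa & lower_gfn_mask) |
			((gpa & rsvd_mask) << rsvd_len);

	/* Reading back, as get_mmio_spte_gfn() now does. */
	uint64_t out  = spte & lower_gfn_mask;
	out |= (spte >> rsvd_len) & rsvd_mask;

	printf("gpa=%#llx restored=%#llx (%s)\n",
	       (unsigned long long)gpa, (unsigned long long)out,
	       gpa == out ? "round-trips" : "mismatch");
	return 0;
}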
+76 −61
@@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2
-#define MSR_BITMAP_MODE_LM		4

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

@@ -857,6 +856,7 @@ struct nested_vmx {

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;
	u64 vmcs01_guest_bndcfgs;

	u16 vpid02;
	u16 last_vpid;
@@ -2899,7 +2899,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

-	if (is_long_mode(&vmx->vcpu))
-		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
@@ -2951,7 +2950,6 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
	vmx->loaded_cpu_state = NULL;

#ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu))
-		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
@@ -2980,24 +2978,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
-	if (is_long_mode(&vmx->vcpu)) {
-		preempt_disable();
-		if (vmx->loaded_cpu_state)
-			rdmsrl(MSR_KERNEL_GS_BASE,
-			       vmx->msr_guest_kernel_gs_base);
-		preempt_enable();
-	}
+	preempt_disable();
+	if (vmx->loaded_cpu_state)
+		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
-	if (is_long_mode(&vmx->vcpu)) {
-		preempt_disable();
-		if (vmx->loaded_cpu_state)
-			wrmsrl(MSR_KERNEL_GS_BASE, data);
-		preempt_enable();
-	}
+	preempt_disable();
+	if (vmx->loaded_cpu_state)
+		wrmsrl(MSR_KERNEL_GS_BASE, data);
+	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif
@@ -3533,9 +3526,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

-	if (kvm_mpx_supported())
-		msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

@@ -3552,8 +3542,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
		VM_ENTRY_LOAD_IA32_PAT;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-	if (kvm_mpx_supported())
-		msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3601,12 +3589,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
		msrs->secondary_ctls_high);
	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
-		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_WBINVD_EXITING;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
@@ -3663,6 +3651,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
		msrs->misc_low,
@@ -5073,19 +5065,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
	if (!msr)
		return;

-	/*
-	 * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
-	 * 64-bit mode as a 64-bit kernel may frequently access the
-	 * MSR.  This means we need to manually save/restore the MSR
-	 * when switching between guest and host state, but only if
-	 * the guest is in 64-bit mode.  Sync our cached value if the
-	 * guest is transitioning to 32-bit mode and the CPU contains
-	 * guest state, i.e. the cache is stale.
-	 */
-#ifdef CONFIG_X86_64
-	if (!(efer & EFER_LMA))
-		(void)vmx_read_guest_kernel_gs_base(vmx);
-#endif
	vcpu->arch.efer = efer;
	if (efer & EFER_LMA) {
		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@ -6078,9 +6057,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
	}

-	if (is_long_mode(vcpu))
-		mode |= MSR_BITMAP_MODE_LM;

	return mode;
}

@@ -6121,9 +6097,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
	if (!changed)
		return;

-	vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
-				  !(mode & MSR_BITMAP_MODE_LM));

	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
		vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);

@@ -6189,6 +6162,11 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
	nested_mark_vmcs12_pages_dirty(vcpu);
}

static u8 vmx_get_rvi(void)
{
	return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
}

static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6201,7 +6179,7 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
		WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
		return false;

-	rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+	rvi = vmx_get_rvi();

	vapic_page = kmap(vmx->nested.virtual_apic_page);
	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
@@ -10245,15 +10223,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
	if (!lapic_in_kernel(vcpu))
		return;

	if (!flexpriority_enabled &&
	    !cpu_has_vmx_virtualize_x2apic_mode())
		return;

	/* Postpone execution until vmcs01 is the current VMCS. */
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
		return;
	}

	if (!cpu_need_tpr_shadow(vcpu))
		return;

	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -10375,6 +10354,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
	return max_irr;
}

static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
{
	u8 rvi = vmx_get_rvi();
	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);

	return ((rvi & 0xf0) > (vppr & 0xf0));
}

static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (!kvm_vcpu_apicv_active(vcpu))
@@ -11264,6 +11251,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
#undef cr4_fixed1_update
}

static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (kvm_mpx_supported()) {
		bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);

		if (mpx_enabled) {
			vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
			vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
		} else {
			vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
			vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
		}
	}
}

static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11280,8 +11284,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

-	if (nested_vmx_allowed(vcpu))
+	if (nested_vmx_allowed(vcpu)) {
		nested_vmx_cr_fixed1_bits_update(vcpu);
		nested_vmx_entry_exit_ctls_update(vcpu);
	}
}

static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -12049,8 +12055,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)

	set_cr4_guest_host_mask(vmx);

-	if (vmx_mpx_supported())
+	if (kvm_mpx_supported()) {
		if (vmx->nested.nested_run_pending &&
			(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
		else
			vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
	}

	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12595,15 +12606,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	bool from_vmentry = !!exit_qual;
	u32 dummy_exit_qual;
-	u32 vmcs01_cpu_exec_ctrl;
+	bool evaluate_pending_interrupts;
	int r = 0;

-	vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);

	enter_guest_mode(vcpu);

	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
	if (kvm_mpx_supported() &&
		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
	vmx_segment_cache_clear(vmx);
@@ -12643,16 +12660,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
	 * to L1 or delivered directly to L2 (e.g. In case L1 don't
	 * intercept EXTERNAL_INTERRUPT).
	 *
-	 * Usually this would be handled by L0 requesting a
-	 * IRQ/NMI window by setting VMCS accordingly. However,
-	 * this setting was done on VMCS01 and now VMCS02 is active
-	 * instead. Thus, we force L0 to perform pending event
-	 * evaluation by requesting a KVM_REQ_EVENT.
-	 */
-	if (vmcs01_cpu_exec_ctrl &
-		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) {
+	 * Usually this would be handled by the processor noticing an
+	 * IRQ/NMI window request, or checking RVI during evaluation of
+	 * pending virtual interrupts.  However, this setting was done
+	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+	 */
+	if (unlikely(evaluate_pending_interrupts))
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
-	}

	/*
	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
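The vmx_get_rvi()/vmx_has_apicv_interrupt() helpers added above compare only the upper nibble of RVI against the upper nibble of VPPR, i.e. the pending interrupt's priority class against the processor priority class. A minimal userspace sketch of that check, with made-up register values:

/* Illustration of the RVI vs. VPPR check: an interrupt is deliverable
 * only if its priority class (bits 7:4 of the vector) exceeds the
 * processor-priority class.  Values below are illustrative. */
#include <stdio.h>
#include <stdint.h>

static int apicv_interrupt_pending(uint8_t rvi, uint8_t vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}

int main(void)
{
	/* RVI 0x51: highest pending vector 0x51, priority class 5.
	 * VPPR 0x30: processor priority class 3 -> class 5 is deliverable. */
	printf("rvi=0x51 vppr=0x30 -> %d\n", apicv_interrupt_pending(0x51, 0x30));

	/* Same priority class (3) as the processor priority -> blocked. */
	printf("rvi=0x3f vppr=0x30 -> %d\n", apicv_interrupt_pending(0x3f, 0x30));
	return 0;
}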
+1 −1
@@ -4698,7 +4698,7 @@ static void kvm_init_msr_list(void)
		 */
		switch (msrs_to_save[i]) {
		case MSR_IA32_BNDCFGS:
-			if (!kvm_x86_ops->mpx_supported())
+			if (!kvm_mpx_supported())
				continue;
			break;
		case MSR_TSC_AUX:
+1 −1
@@ -1325,7 +1325,7 @@ class Tui(object):
        msg = ''
        while True:
            self.screen.erase()
-            self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' %
+            self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' %
                               DELAY_DEFAULT, curses.A_BOLD)
            self.screen.addstr(4, 0, msg)
            self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %
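The kvm_stat change above only narrows the format specifier: '%f' prints six decimal places by default, '%.1f' prints one. Python's %-formatting follows C's printf rules, so the same effect can be shown in a few lines of C (the 0.5s delay is a stand-in, not the actual DELAY_DEFAULT):

/* '%f' defaults to six decimal places; '%.1f' prints one. */
#include <stdio.h>

int main(void)
{
	double delay = 0.5;	/* illustrative stand-in for DELAY_DEFAULT */

	printf("Set update interval (defaults to %fs).\n", delay);   /* 0.500000s */
	printf("Set update interval (defaults to %.1fs).\n", delay); /* 0.5s */
	return 0;
}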