Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm (5fabc487) · Commits · e / devices / android_kernel_xiaomi_markw

Documentation/kernel-parameters.txt

+4 −4

Original line number	Diff line number	Diff line
		@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
		for all guests.
		Default is 1 (enabled) if in 64bit or 32bit-PAE mode

		kvm-intel.bypass_guest_pf=
		[KVM,Intel] Disables bypassing of guest page faults
		on Intel chips. Default is 1 (enabled)

		kvm-intel.ept= [KVM,Intel] Disable extended page tables
		(virtualized MMU) support on capable Intel chips.
		Default is 1 (enabled)
		@@ -1737,6 +1733,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
		no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
		fault handling.

		no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting.
		Steal time is still computed, but won't influence scheduler
		behaviour.

		nolapic [X86-32,APIC] Do not enable or use the local APIC.

		nolapic_timer [X86-32,APIC] Do not use the local APIC timer.

Documentation/virtual/kvm/api.txt

+164 −8

Original line number	Diff line number	Diff line
		@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
		If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
		cpus max.

		On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
		threads in one or more virtual CPU cores. (This is because the
		hardware requires all the hardware threads in a CPU core to be in the
		same partition.) The KVM_CAP_PPC_SMT capability indicates the number
		of vcpus per virtual core (vcore). The vcore id is obtained by
		dividing the vcpu id by the number of vcpus per vcore. The vcpus in a
		given vcore will always be in the same physical core as each other
		(though that might be a different physical core from time to time).
		Userspace can control the threading (SMT) mode of the guest by its
		allocation of vcpu ids. For example, if userspace wants
		single-threaded guest vcpus, it should make all vcpu ids be a multiple
		of the number of vcpus per vcore.

		4.8 KVM_GET_DIRTY_LOG (vm ioctl)

		Capability: basic
		@@ -1143,15 +1156,10 @@ Assigns an IRQ to a passed-through device.

		struct kvm_assigned_irq {
		__u32 assigned_dev_id;
		__u32 host_irq;
		__u32 host_irq; /* ignored (legacy field) */
		__u32 guest_irq;
		__u32 flags;
		union {
		struct {
		__u32 addr_lo;
		__u32 addr_hi;
		__u32 data;
		} guest_msi;
		__u32 reserved[12];
		};
		};
		@@ -1239,8 +1247,10 @@ Type: vm ioctl
		Parameters: struct kvm_assigned_msix_nr (in)
		Returns: 0 on success, -1 on error

		Set the number of MSI-X interrupts for an assigned device. This service can
		only be called once in the lifetime of an assigned device.
		Set the number of MSI-X interrupts for an assigned device. The number is
		reset again by terminating the MSI-X assignment of the device via
		KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier
		point will fail.

		struct kvm_assigned_msix_nr {
		__u32 assigned_dev_id;
		@@ -1291,6 +1301,135 @@ Returns the tsc frequency of the guest. The unit of the return value is
		KHz. If the host has unstable tsc this ioctl returns -EIO instead as an
		error.

		4.56 KVM_GET_LAPIC

		Capability: KVM_CAP_IRQCHIP
		Architectures: x86
		Type: vcpu ioctl
		Parameters: struct kvm_lapic_state (out)
		Returns: 0 on success, -1 on error

		#define KVM_APIC_REG_SIZE 0x400
		struct kvm_lapic_state {
		char regs[KVM_APIC_REG_SIZE];
		};

		Reads the Local APIC registers and copies them into the input argument. The
		data format and layout are the same as documented in the architecture manual.

		4.57 KVM_SET_LAPIC

		Capability: KVM_CAP_IRQCHIP
		Architectures: x86
		Type: vcpu ioctl
		Parameters: struct kvm_lapic_state (in)
		Returns: 0 on success, -1 on error

		#define KVM_APIC_REG_SIZE 0x400
		struct kvm_lapic_state {
		char regs[KVM_APIC_REG_SIZE];
		};

		Copies the input argument into the Local APIC registers. The data format
		and layout are the same as documented in the architecture manual.

		4.58 KVM_IOEVENTFD

		Capability: KVM_CAP_IOEVENTFD
		Architectures: all
		Type: vm ioctl
		Parameters: struct kvm_ioeventfd (in)
		Returns: 0 on success, !0 on error

		This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
		within the guest. A guest write to the registered address will signal the
		provided event instead of triggering an exit.

		struct kvm_ioeventfd {
		__u64 datamatch;
		__u64 addr; /* legal pio/mmio address */
		__u32 len; /* 1, 2, 4, or 8 bytes */
		__s32 fd;
		__u32 flags;
		__u8 pad[36];
		};

		The following flags are defined:

		#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
		#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio)
		#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)

		If the datamatch flag is set, the event will be signaled only if the value
		written to the registered address is equal to datamatch in struct kvm_ioeventfd.

		4.62 KVM_CREATE_SPAPR_TCE

		Capability: KVM_CAP_SPAPR_TCE
		Architectures: powerpc
		Type: vm ioctl
		Parameters: struct kvm_create_spapr_tce (in)
		Returns: file descriptor for manipulating the created TCE table

		This creates a virtual TCE (translation control entry) table, which
		is an IOMMU for PAPR-style virtual I/O. It is used to translate
		logical addresses used in virtual I/O into guest physical addresses,
		and provides a scatter/gather capability for PAPR virtual I/O.

		/* for KVM_CAP_SPAPR_TCE */
		struct kvm_create_spapr_tce {
		__u64 liobn;
		__u32 window_size;
		};

		The liobn field gives the logical IO bus number for which to create a
		TCE table. The window_size field specifies the size of the DMA window
		which this TCE table will translate - the table will contain one 64
		bit TCE entry for every 4kiB of the DMA window.

		When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE
		table has been created using this ioctl(), the kernel will handle it
		in real mode, updating the TCE table. H_PUT_TCE calls for other
		liobns will cause a vm exit and must be handled by userspace.

		The return value is a file descriptor which can be passed to mmap(2)
		to map the created TCE table into userspace. This lets userspace read
		the entries written by kernel-handled H_PUT_TCE calls, and also lets
		userspace update the TCE table directly which is useful in some
		circumstances.

		4.63 KVM_ALLOCATE_RMA

		Capability: KVM_CAP_PPC_RMA
		Architectures: powerpc
		Type: vm ioctl
		Parameters: struct kvm_allocate_rma (out)
		Returns: file descriptor for mapping the allocated RMA

		This allocates a Real Mode Area (RMA) from the pool allocated at boot
		time by the kernel. An RMA is a physically-contiguous, aligned region
		of memory used on older POWER processors to provide the memory which
		will be accessed by real-mode (MMU off) accesses in a KVM guest.
		POWER processors support a set of sizes for the RMA that usually
		includes 64MB, 128MB, 256MB and some larger powers of two.

		/* for KVM_ALLOCATE_RMA */
		struct kvm_allocate_rma {
		__u64 rma_size;
		};

		The return value is a file descriptor which can be passed to mmap(2)
		to map the allocated RMA into userspace. The mapped area can then be
		passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the
		RMA for a virtual machine. The size of the RMA in bytes (which is
		fixed at host kernel boot time) is returned in the rma_size field of
		the argument structure.

		The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl
		is supported; 2 if the processor requires all virtual machines to have
		an RMA, or 1 if the processor can use an RMA but doesn't require it,
		because it supports the Virtual RMA (VRMA) facility.

		5. The kvm_run structure

		Application code obtains a pointer to the kvm_run structure by
		@@ -1473,6 +1612,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
		necessary. Upon guest entry all guest GPRs will then be replaced by the values
		in this struct.

		/* KVM_EXIT_PAPR_HCALL */
		struct {
		__u64 nr;
		__u64 ret;
		__u64 args[9];
		} papr_hcall;

		This is used on 64-bit PowerPC when emulating a pSeries partition,
		e.g. with the 'pseries' machine type in qemu. It occurs when the
		guest does a hypercall using the 'sc 1' instruction. The 'nr' field
		contains the hypercall number (from the guest R3), and 'args' contains
		the arguments (from the guest R4 - R12). Userspace should put the
		return code in 'ret' and any extra returned values in args[].
		The possible hypercalls are defined in the Power Architecture Platform
		Requirements (PAPR) document available from www.power.org (free
		developer registration required to access it).

		/* Fix the size of the union. */
		char padding[256];
		};

Documentation/virtual/kvm/mmu.txt

+18 −0

Original line number	Diff line number	Diff line
		@@ -165,6 +165,10 @@ Shadow pages contain the following information:
		Contains the value of efer.nxe for which the page is valid.
		role.cr0_wp:
		Contains the value of cr0.wp for which the page is valid.
		role.smep_andnot_wp:
		Contains the value of cr4.smep && !cr0.wp for which the page is valid
		(pages for which this is true are different from other pages; see the
		treatment of cr0.wp=0 below).
		gfn:
		Either the guest page table containing the translations shadowed by this
		page, or the base page frame for linear translations. See role.direct.
		@@ -317,6 +321,20 @@ on fault type:

		(user write faults generate a #PF)

		In the first case there is an additional complication if CR4.SMEP is
		enabled: since we've turned the page into a kernel page, the kernel may now
		execute it. We handle this by also setting spte.nx. If we get a user
		fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back.

		To prevent an spte that was converted into a kernel page with cr0.wp=0
		from being written by the kernel after cr0.wp has changed to 1, we make
		the value of cr0.wp part of the page role. This means that an spte created
		with one value of cr0.wp cannot be used when cr0.wp has a different value -
		it will simply be missed by the shadow page lookup code. A similar issue
		exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after
		changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep
		is also made a part of the page role.

		Large pages
		===========

Documentation/virtual/kvm/msr.txt

+34 −0

Original line number	Diff line number	Diff line
		@@ -185,3 +185,37 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02

		Currently type 2 APF will be always delivered on the same vcpu as
		type 1 was, but guest should not rely on that.

		MSR_KVM_STEAL_TIME: 0x4b564d03

		data: 64-byte aligned physical address of a memory area which must be
		in guest RAM, plus an enable bit in bit 0. This memory is expected to
		hold a copy of the following structure:

		struct kvm_steal_time {
		__u64 steal;
		__u32 version;
		__u32 flags;
		__u32 pad[12];
		}

		whose data will be filled in by the hypervisor periodically. Only one
		write, or registration, is needed for each VCPU. The interval between
		updates of this structure is arbitrary and implementation-dependent.
		The hypervisor may update this structure at any time it sees fit until
		a value with bit0 == 0 is written to the MSR. The guest is required to
		make sure this structure is initialized to zero.

		Fields have the following meanings:

		version: a sequence counter. In other words, guest has to check
		this field before and after grabbing time information and make
		sure they are both equal and even. An odd version indicates an
		in-progress update.

		flags: At this point, always zero. May be used to indicate
		changes in this structure in the future.

		steal: the amount of time in which this vCPU did not run, in
		nanoseconds. Time during which the vcpu is idle will not be
		reported as steal time.

Documentation/virtual/kvm/nested-vmx.txt

0 → 100644

+251 −0

Original line number	Diff line number	Diff line
		Nested VMX
		==========

		Overview
		---------

		On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions)
		to easily and efficiently run guest operating systems. Normally, these guests
		cannot themselves be hypervisors running their own guests, because in VMX,
		guests cannot use VMX instructions.

		The "Nested VMX" feature adds this missing capability - of running guest
		hypervisors (which use VMX) with their own nested guests. It does so by
		allowing a guest to use VMX instructions, and correctly and efficiently
		emulating them using the single level of VMX available in the hardware.

		We describe in much greater detail the theory behind the nested VMX feature,
		its implementation and its performance characteristics, in the OSDI 2010 paper
		"The Turtles Project: Design and Implementation of Nested Virtualization",
		available at:

		http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf


		Terminology
		-----------

		Single-level virtualization has two levels - the host (KVM) and the guests.
		In nested virtualization, we have three levels: The host (KVM), which we call
		L0, the guest hypervisor, which we call L1, and its nested guest, which we
		call L2.


		Known limitations
		-----------------

		The current code supports running Linux guests under KVM guests.
		Only 64-bit guest hypervisors are supported.

		Additional patches for running Windows under guest KVM, and Linux under
		guest VMware server, and support for nested EPT, are currently running in
		the lab, and will be sent as follow-on patchsets.


		Running nested VMX
		------------------

		The nested VMX feature is disabled by default. It can be enabled by giving
		the "nested=1" option to the kvm-intel module.

		No modifications are required to user space (qemu). However, qemu's default
		emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
		explicitly enabled, by giving qemu one of the following options:

		-cpu host (emulated CPU has all features of the real CPU)

		-cpu qemu64,+vmx (add just the vmx feature to a named CPU type)


		ABIs
		----

		Nested VMX aims to present a standard and (eventually) fully-functional VMX
		implementation for a guest hypervisor to use. As such, the official
		specification of the ABI that it provides is Intel's VMX specification,
		namely volume 3B of their "Intel 64 and IA-32 Architectures Software
		Developer's Manual". Not all of VMX's features are currently fully supported,
		but the goal is to eventually support them all, starting with the VMX features
		which are used in practice by popular hypervisors (KVM and others).

		As a VMX implementation, nested VMX presents a VMCS structure to L1.
		As mandated by the spec, other than the two fields revision_id and abort,
		this structure is opaque to its user, who is not supposed to know or care
		about its internal structure. Rather, the structure is accessed through the
		VMREAD and VMWRITE instructions.
		Still, for debugging purposes, KVM developers might be interested to know the
		internals of this structure; this is struct vmcs12 from arch/x86/kvm/vmx.c.

		The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we
		also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS
		which L0 builds to actually run L2 - how this is done is explained in the
		aforementioned paper.

		For convenience, we repeat the content of struct vmcs12 here. If the internals
		of this structure change, this can break live migration across KVM versions.
		VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner
		struct shadow_vmcs is ever changed.

		typedef u64 natural_width;
		struct __packed vmcs12 {
		/* According to the Intel spec, a VMCS region must start with
		* these two user-visible fields */
		u32 revision_id;
		u32 abort;

		u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
		u32 padding[7]; /* room for future expansion */

		u64 io_bitmap_a;
		u64 io_bitmap_b;
		u64 msr_bitmap;
		u64 vm_exit_msr_store_addr;
		u64 vm_exit_msr_load_addr;
		u64 vm_entry_msr_load_addr;
		u64 tsc_offset;
		u64 virtual_apic_page_addr;
		u64 apic_access_addr;
		u64 ept_pointer;
		u64 guest_physical_address;
		u64 vmcs_link_pointer;
		u64 guest_ia32_debugctl;
		u64 guest_ia32_pat;
		u64 guest_ia32_efer;
		u64 guest_pdptr0;
		u64 guest_pdptr1;
		u64 guest_pdptr2;
		u64 guest_pdptr3;
		u64 host_ia32_pat;
		u64 host_ia32_efer;
		u64 padding64[8]; /* room for future expansion */
		natural_width cr0_guest_host_mask;
		natural_width cr4_guest_host_mask;
		natural_width cr0_read_shadow;
		natural_width cr4_read_shadow;
		natural_width cr3_target_value0;
		natural_width cr3_target_value1;
		natural_width cr3_target_value2;
		natural_width cr3_target_value3;
		natural_width exit_qualification;
		natural_width guest_linear_address;
		natural_width guest_cr0;
		natural_width guest_cr3;
		natural_width guest_cr4;
		natural_width guest_es_base;
		natural_width guest_cs_base;
		natural_width guest_ss_base;
		natural_width guest_ds_base;
		natural_width guest_fs_base;
		natural_width guest_gs_base;
		natural_width guest_ldtr_base;
		natural_width guest_tr_base;
		natural_width guest_gdtr_base;
		natural_width guest_idtr_base;
		natural_width guest_dr7;
		natural_width guest_rsp;
		natural_width guest_rip;
		natural_width guest_rflags;
		natural_width guest_pending_dbg_exceptions;
		natural_width guest_sysenter_esp;
		natural_width guest_sysenter_eip;
		natural_width host_cr0;
		natural_width host_cr3;
		natural_width host_cr4;
		natural_width host_fs_base;
		natural_width host_gs_base;
		natural_width host_tr_base;
		natural_width host_gdtr_base;
		natural_width host_idtr_base;
		natural_width host_ia32_sysenter_esp;
		natural_width host_ia32_sysenter_eip;
		natural_width host_rsp;
		natural_width host_rip;
		natural_width paddingl[8]; /* room for future expansion */
		u32 pin_based_vm_exec_control;
		u32 cpu_based_vm_exec_control;
		u32 exception_bitmap;
		u32 page_fault_error_code_mask;
		u32 page_fault_error_code_match;
		u32 cr3_target_count;
		u32 vm_exit_controls;
		u32 vm_exit_msr_store_count;
		u32 vm_exit_msr_load_count;
		u32 vm_entry_controls;
		u32 vm_entry_msr_load_count;
		u32 vm_entry_intr_info_field;
		u32 vm_entry_exception_error_code;
		u32 vm_entry_instruction_len;
		u32 tpr_threshold;
		u32 secondary_vm_exec_control;
		u32 vm_instruction_error;
		u32 vm_exit_reason;
		u32 vm_exit_intr_info;
		u32 vm_exit_intr_error_code;
		u32 idt_vectoring_info_field;
		u32 idt_vectoring_error_code;
		u32 vm_exit_instruction_len;
		u32 vmx_instruction_info;
		u32 guest_es_limit;
		u32 guest_cs_limit;
		u32 guest_ss_limit;
		u32 guest_ds_limit;
		u32 guest_fs_limit;
		u32 guest_gs_limit;
		u32 guest_ldtr_limit;
		u32 guest_tr_limit;
		u32 guest_gdtr_limit;
		u32 guest_idtr_limit;
		u32 guest_es_ar_bytes;
		u32 guest_cs_ar_bytes;
		u32 guest_ss_ar_bytes;
		u32 guest_ds_ar_bytes;
		u32 guest_fs_ar_bytes;
		u32 guest_gs_ar_bytes;
		u32 guest_ldtr_ar_bytes;
		u32 guest_tr_ar_bytes;
		u32 guest_interruptibility_info;
		u32 guest_activity_state;
		u32 guest_sysenter_cs;
		u32 host_ia32_sysenter_cs;
		u32 padding32[8]; /* room for future expansion */
		u16 virtual_processor_id;
		u16 guest_es_selector;
		u16 guest_cs_selector;
		u16 guest_ss_selector;
		u16 guest_ds_selector;
		u16 guest_fs_selector;
		u16 guest_gs_selector;
		u16 guest_ldtr_selector;
		u16 guest_tr_selector;
		u16 host_es_selector;
		u16 host_cs_selector;
		u16 host_ss_selector;
		u16 host_ds_selector;
		u16 host_fs_selector;
		u16 host_gs_selector;
		u16 host_tr_selector;
		};


		Authors
		-------

		These patches were written by:
		Abel Gordon, abelg <at> il.ibm.com
		Nadav Har'El, nyh <at> il.ibm.com
		Orit Wasserman, oritw <at> il.ibm.com
		Ben-Ami Yassor, benami <at> il.ibm.com
		Muli Ben-Yehuda, muli <at> il.ibm.com

		With contributions by:
		Anthony Liguori, aliguori <at> us.ibm.com
		Mike Day, mdday <at> us.ibm.com
		Michael Factor, factor <at> il.ibm.com
		Zvi Dubitzky, dubi <at> il.ibm.com

		And valuable reviews by:
		Avi Kivity, avi <at> redhat.com
		Gleb Natapov, gleb <at> redhat.com
		Marcelo Tosatti, mtosatti <at> redhat.com
		Kevin Tian, kevin.tian <at> intel.com
		and others.