Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a7e1aabb authored by Linus Torvalds's avatar Linus Torvalds
Browse files
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
  lguest: Fix in/out emulation
  lguest: Fix translation count about wikipedia's cpuid page
  lguest: Fix three simple typos in comments
  lguest: update comments
  lguest: Simplify device initialization.
  lguest: don't rewrite vmcall instructions
  lguest: remove remaining vmcall
  lguest: use a special 1:1 linear pagetable mode until first switch.
  lguest: Do not exit on non-fatal errors
parents 111ad119 996ba96a
Loading
Loading
Loading
Loading
+16 −31
Original line number Original line Diff line number Diff line
@@ -51,7 +51,7 @@
#include <asm/bootparam.h>
#include <asm/bootparam.h>
#include "../../../include/linux/lguest_launcher.h"
#include "../../../include/linux/lguest_launcher.h"
/*L:110
/*L:110
 * We can ignore the 42 include files we need for this program, but I do want
 * We can ignore the 43 include files we need for this program, but I do want
 * to draw attention to the use of kernel-style types.
 * to draw attention to the use of kernel-style types.
 *
 *
 * As Linus said, "C is a Spartan language, and so should your naming be."  I
 * As Linus said, "C is a Spartan language, and so should your naming be."  I
@@ -65,7 +65,6 @@ typedef uint16_t u16;
typedef uint8_t u8;
typedef uint8_t u8;
/*:*/
/*:*/


#define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
#define BRIDGE_PFX "bridge:"
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#ifndef SIOCBRADDIF
#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
@@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq)
	/* writev can return a partial write, so we loop here. */
	/* writev can return a partial write, so we loop here. */
	while (!iov_empty(iov, out)) {
	while (!iov_empty(iov, out)) {
		int len = writev(STDOUT_FILENO, iov, out);
		int len = writev(STDOUT_FILENO, iov, out);
		if (len <= 0)
		if (len <= 0) {
			err(1, "Write to stdout gave %i", len);
			warn("Write to stdout gave %i (%d)", len, errno);
			break;
		}
		iov_consume(iov, out, len);
		iov_consume(iov, out, len);
	}
	}


@@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq)
	 * same format: what a coincidence!
	 * same format: what a coincidence!
	 */
	 */
	if (writev(net_info->tunfd, iov, out) < 0)
	if (writev(net_info->tunfd, iov, out) < 0)
		errx(1, "Write to tun failed?");
		warnx("Write to tun failed (%d)?", errno);


	/*
	/*
	 * Done with that one; wait_for_vq_desc() will send the interrupt if
	 * Done with that one; wait_for_vq_desc() will send the interrupt if
@@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq)
	 */
	 */
	len = readv(net_info->tunfd, iov, in);
	len = readv(net_info->tunfd, iov, in);
	if (len <= 0)
	if (len <= 0)
		err(1, "Failed to read from tun.");
		warn("Failed to read from tun (%d).", errno);


	/*
	/*
	 * Mark that packet buffer as used, but don't interrupt here.  We want
	 * Mark that packet buffer as used, but don't interrupt here.  We want
@@ -1093,8 +1094,9 @@ static void update_device_status(struct device *dev)
		warnx("Device %s configuration FAILED", dev->name);
		warnx("Device %s configuration FAILED", dev->name);
		if (dev->running)
		if (dev->running)
			reset_device(dev);
			reset_device(dev);
	} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
	} else {
		if (!dev->running)
		if (dev->running)
			err(1, "Device %s features finalized twice", dev->name);
		start_device(dev);
		start_device(dev);
	}
	}
}
}
@@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr)
			return;
			return;
		}
		}


		/*
		/* Devices should not be used before features are finalized. */
		 * Devices *can* be used before status is set to DRIVER_OK.
		 * The original plan was that they would never do this: they
		 * would always finish setting up their status bits before
		 * actually touching the virtqueues.  In practice, we allowed
		 * them to, and they do (eg. the disk probes for partition
		 * tables as part of initialization).
		 *
		 * If we see this, we start the device: once it's running, we
		 * expect the device to catch all the notifications.
		 */
		for (vq = i->vq; vq; vq = vq->next) {
		for (vq = i->vq; vq; vq = vq->next) {
			if (addr != vq->config.pfn*getpagesize())
			if (addr != vq->config.pfn*getpagesize())
				continue;
				continue;
			if (i->running)
			errx(1, "Notification on %s before setup!", i->name);
				errx(1, "Notification on running %s", i->name);
			/* This just calls create_thread() for each virtqueue */
			start_device(i);
			return;
		}
		}
	}
	}


@@ -1370,7 +1358,7 @@ static void setup_console(void)
 * --sharenet=<name> option which opens or creates a named pipe.  This can be
 * --sharenet=<name> option which opens or creates a named pipe.  This can be
 * used to send packets to another guest in a 1:1 manner.
 * used to send packets to another guest in a 1:1 manner.
 *
 *
 * More sopisticated is to use one of the tools developed for projects like UML
 * More sophisticated is to use one of the tools developed for projects like UML
 * to do networking.
 * to do networking.
 *
 *
 * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
 * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
@@ -1380,7 +1368,7 @@ static void setup_console(void)
 * multiple inter-guest channels behind one interface, although it would
 * multiple inter-guest channels behind one interface, although it would
 * require some manner of hotplugging new virtio channels.
 * require some manner of hotplugging new virtio channels.
 *
 *
 * Finally, we could implement a virtio network switch in the kernel.
 * Finally, we could use a virtio network switch in the kernel, ie. vhost.
:*/
:*/


static u32 str2ip(const char *ipaddr)
static u32 str2ip(const char *ipaddr)
@@ -2017,10 +2005,7 @@ int main(int argc, char *argv[])
	/* Tell the entry path not to try to reload segment registers. */
	/* Tell the entry path not to try to reload segment registers. */
	boot->hdr.loadflags |= KEEP_SEGMENTS;
	boot->hdr.loadflags |= KEEP_SEGMENTS;


	/*
	/* We tell the kernel to initialize the Guest. */
	 * We tell the kernel to initialize the Guest: this returns the open
	 * /dev/lguest file descriptor.
	 */
	tell_kernel(start);
	tell_kernel(start);


	/* Ensure that we terminate if a device-servicing child dies. */
	/* Ensure that we terminate if a device-servicing child dies. */
+1 −0
Original line number Original line Diff line number Diff line
@@ -61,6 +61,7 @@ hcall(unsigned long call,
		     : "memory");
		     : "memory");
	return call;
	return call;
}
}
/*:*/


/* Can't use our min() macro here: needs to be a constant */
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
+0 −1
Original line number Original line Diff line number Diff line
@@ -63,7 +63,6 @@ void foo(void)
	BLANK();
	BLANK();
	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);


	BLANK();
	BLANK();
	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+22 −14
Original line number Original line Diff line number Diff line
@@ -71,7 +71,8 @@
#include <asm/stackprotector.h>
#include <asm/stackprotector.h>
#include <asm/reboot.h>		/* for struct machine_ops */
#include <asm/reboot.h>		/* for struct machine_ops */


/*G:010 Welcome to the Guest!
/*G:010
 * Welcome to the Guest!
 *
 *
 * The Guest in our tale is a simple creature: identical to the Host but
 * The Guest in our tale is a simple creature: identical to the Host but
 * behaving in simplified but equivalent ways.  In particular, the Guest is the
 * behaving in simplified but equivalent ways.  In particular, the Guest is the
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
#endif
#endif


/*G:036
/*G:036
 * When lazy mode is turned off reset the per-cpu lazy mode variable and then
 * When lazy mode is turned off, we issue the do-nothing hypercall to
 * issue the do-nothing hypercall to flush any stored calls.
 * flush any stored calls, and call the generic helper to reset the
:*/
 * per-cpu lazy mode variable.
 */
static void lguest_leave_lazy_mmu_mode(void)
static void lguest_leave_lazy_mmu_mode(void)
{
{
	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
	paravirt_leave_lazy_mmu();
	paravirt_leave_lazy_mmu();
}
}


/*
 * We also catch the end of context switch; we enter lazy mode for much of
 * that too, so again we need to flush here.
 *
 * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
 * mode, but unlike Xen, lguest doesn't care about the difference).
 */
static void lguest_end_context_switch(struct task_struct *next)
static void lguest_end_context_switch(struct task_struct *next)
{
{
	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
 * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
 * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
 *
 *
 * This instruction even has its own Wikipedia entry.  The Wikipedia entry
 * This instruction even has its own Wikipedia entry.  The Wikipedia entry
 * has been translated into 5 languages.  I am not making this up!
 * has been translated into 6 languages.  I am not making this up!
 *
 *
 * We could get funky here and identify ourselves as "GenuineLguest", but
 * We could get funky here and identify ourselves as "GenuineLguest", but
 * instead we just use the real "cpuid" instruction.  Then I pretty much turned
 * instead we just use the real "cpuid" instruction.  Then I pretty much turned
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
	/*
	/*
	 * PAE systems can mark pages as non-executable.  Linux calls this the
	 * PAE systems can mark pages as non-executable.  Linux calls this the
	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
	 * Virus Protection).  We just switch turn if off here, since we don't
	 * Virus Protection).  We just switch it off here, since we don't
	 * support it.
	 * support it.
	 */
	 */
	case 0x80000001:
	case 0x80000001:
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)


/* See lguest_set_pte() below. */
/* See lguest_set_pte() below. */
static bool cr3_changed = false;
static bool cr3_changed = false;
static unsigned long current_cr3;


/*
/*
 * cr3 is the current toplevel pagetable page: the principle is the same as
 * cr3 is the current toplevel pagetable page: the principle is the same as
 * cr0.  Keep a local copy, and tell the Host when it changes.  The only
 * cr0.  Keep a local copy, and tell the Host when it changes.
 * difference is that our local copy is in lguest_data because the Host needs
 * to set it upon our initial hypercall.
 */
 */
static void lguest_write_cr3(unsigned long cr3)
static void lguest_write_cr3(unsigned long cr3)
{
{
	lguest_data.pgdir = cr3;
	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
	current_cr3 = cr3;


	/* These two page tables are simple, linear, and used during boot */
	/* These two page tables are simple, linear, and used during boot */
	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)


static unsigned long lguest_read_cr3(void)
static unsigned long lguest_read_cr3(void)
{
{
	return lguest_data.pgdir;
	return current_cr3;
}
}


/* cr4 is used to enable and disable PGE, but we don't care. */
/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
static void lguest_flush_tlb_single(unsigned long addr)
static void lguest_flush_tlb_single(unsigned long addr)
{
{
	/* Simply set it to zero: if it was not, it will fault back in. */
	/* Simply set it to zero: if it was not, it will fault back in. */
	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
}
}


/*
/*
+20 −15
Original line number Original line Diff line number Diff line
@@ -6,18 +6,22 @@
#include <asm/processor-flags.h>
#include <asm/processor-flags.h>


/*G:020
/*G:020
 * Our story starts with the kernel booting into startup_32 in

 * arch/x86/kernel/head_32.S.  It expects a boot header, which is created by
 * Our story starts with the bzImage: booting starts at startup_32 in
 * the bootloader (the Launcher in our case).
 * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real
 * kernel in place and then jumps into it: startup_32 in
 * arch/x86/kernel/head_32.S.  Both routines expect a boot header in the %esi
 * register, which is created by the bootloader (the Launcher in our case).
 *
 *
 * The startup_32 function does very little: it clears the uninitialized global
 * The startup_32 function does very little: it clears the uninitialized global
 * C variables which we expect to be zero (ie. BSS) and then copies the boot
 * C variables which we expect to be zero (ie. BSS) and then copies the boot
 * header and kernel command line somewhere safe.  Finally it checks the
 * header and kernel command line somewhere safe, and populates some initial
 * 'hardware_subarch' field.  This was introduced in 2.6.24 for lguest and Xen:
 * page tables.  Finally it checks the 'hardware_subarch' field.  This was
 * if it's set to '1' (lguest's assigned number), then it calls us here.
 * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
 * assigned number), then it calls us here.
 *
 *
 * WARNING: be very careful here!  We're running at addresses equal to physical
 * WARNING: be very careful here!  We're running at addresses equal to physical
 * addesses (around 0), not above PAGE_OFFSET as most code expectes
 * addresses (around 0), not above PAGE_OFFSET as most code expects
 * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
 * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
 * data without remembering to subtract __PAGE_OFFSET!
 * data without remembering to subtract __PAGE_OFFSET!
 *
 *
@@ -27,13 +31,18 @@
.section .init.text, "ax", @progbits
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
ENTRY(lguest_entry)
	/*
	/*
	 * We make the "initialization" hypercall now to tell the Host about
	 * We make the "initialization" hypercall now to tell the Host where
	 * us, and also find out where it put our page tables.
	 * our lguest_data struct is.
	 */
	 */
	movl $LHCALL_LGUEST_INIT, %eax
	movl $LHCALL_LGUEST_INIT, %eax
	movl $lguest_data - __PAGE_OFFSET, %ebx
	movl $lguest_data - __PAGE_OFFSET, %ebx
	int $LGUEST_TRAP_ENTRY
	int $LGUEST_TRAP_ENTRY


	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
	movl $LHCALL_NEW_PGTABLE, %eax
	movl $(initial_page_table - __PAGE_OFFSET), %ebx
	int $LGUEST_TRAP_ENTRY

	/* Set up the initial stack so we can run C code. */
	/* Set up the initial stack so we can run C code. */
	movl $(init_thread_union+THREAD_SIZE),%esp
	movl $(init_thread_union+THREAD_SIZE),%esp


@@ -96,12 +105,8 @@ send_interrupts:
	 */
	 */
	pushl %eax
	pushl %eax
	movl $LHCALL_SEND_INTERRUPTS, %eax
	movl $LHCALL_SEND_INTERRUPTS, %eax
	/*
	/* This is the actual hypercall trap. */
	 * This is a vmcall instruction (same thing that KVM uses).  Older
	int  $LGUEST_TRAP_ENTRY
	 * assembler versions might not know the "vmcall" instruction, so we
	 * create one manually here.
	 */
	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
	/* Put eax back the way we found it. */
	/* Put eax back the way we found it. */
	popl %eax
	popl %eax
	ret
	ret
Loading