Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c565650b authored by Rusty Russell's avatar Rusty Russell
Browse files

lguest: send trap 13 through to userspace.



We copy 7 bytes at eip for userspace's instruction decode; we have to
carefully handle the case where eip is at the end of a page.  We can't
leave this to userspace since kernel has all the page table decode
logic.

The decode logic moves to userspace, basically unchanged.

Signed-off-by: default avatarRusty Russell <rusty@rustcorp.com.au>
parent c9e433e4
Loading
Loading
Loading
Loading
+43 −90
Original line number Diff line number Diff line
@@ -314,95 +314,52 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 * usually attached to a PC.
 *
 * When the Guest uses one of these instructions, we get a trap (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did.
 * Protection Fault) and come here.  We queue this to be sent out to the
 * Launcher to handle.
 */
static int emulate_insn(struct lg_cpu *cpu)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, small_operand = 0;
	/*
	 * The eip contains the *virtual* address of the Guest's instruction:
	 * walk the Guest's page tables to find the "physical" address.
	 */
	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);

	/*
	 * This must be the Guest kernel trying to do something, not userspace!
	 * The bottom two bits of the CS segment register are the privilege
	 * level.
	 */
	if ((cpu->regs->cs & 3) != GUEST_PL)
		return 0;

	/* Decoding x86 instructions is icky. */
	insn = lgread(cpu, physaddr, u8);

/*
	 * Around 2.6.33, the kernel started using an emulation for the
	 * cmpxchg8b instruction in early boot on many configurations.  This
	 * code isn't paravirtualized, and it tries to disable interrupts.
	 * Ignore it, which will Mostly Work.
 * The eip contains the *virtual* address of the Guest's instruction:
 * we copy the instruction here so the Launcher doesn't have to walk
 * the page tables to decode it.  We handle the case (eg. in a kernel
 * module) where the instruction is over two pages, and the pages are
 * virtually but not physically contiguous.
 *
 * The longest possible x86 instruction is 15 bytes, but we don't handle
 * anything that strange.
 */
	if (insn == 0xfa) {
		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
		cpu->regs->eip++;
		return 1;
static void copy_from_guest(struct lg_cpu *cpu,
			    void *dst, unsigned long vaddr, size_t len)
{
	size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
	unsigned long paddr;

	BUG_ON(len > PAGE_SIZE);

	/* If it goes over a page, copy in two parts. */
	if (len > to_page_end) {
		/* But make sure the next page is mapped! */
		if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
			copy_from_guest(cpu, dst + to_page_end,
					vaddr + to_page_end,
					len - to_page_end);
		else
			/* Otherwise fill with zeroes. */
			memset(dst + to_page_end, 0, len - to_page_end);
		len = to_page_end;
	}

	/*
	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
	 */
	if (insn == 0x66) {
		small_operand = 1;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
		insn = lgread(cpu, physaddr + insnlen, u8);
	/* This will kill the guest if it isn't mapped, but that
	 * shouldn't happen. */
	__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
}

	/*
	 * We can ignore the lower bit for the moment and decode the 4 opcodes
	 * we need to emulate.
	 */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/*
	 * If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there".
	 */
	if (in) {
		/* Lower bit tells means it's a 32/16 bit access */
		if (insn & 0x1) {
			if (small_operand)
				cpu->regs->eax |= 0xFFFF;
			else
				cpu->regs->eax = 0xFFFFFFFF;
		} else
			cpu->regs->eax |= 0xFF;
	}
	/* Finally, we've "done" the instruction, so move past it. */
	cpu->regs->eip += insnlen;
	/* Success! */
	return 1;
static void setup_emulate_insn(struct lg_cpu *cpu)
{
	cpu->pending.trap = 13;
	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
			sizeof(cpu->pending.insn));
}

/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
@@ -410,13 +367,9 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
	switch (cpu->regs->trapnum) {
	case 13: /* We've intercepted a General Protection Fault. */
		/*
		 * Check if this was one of those annoying IN or OUT
		 * instructions which we need to emulate.  If so, we just go
		 * back into the Guest after we've done it.
		 */
		/* Hand to Launcher to emulate those pesky IN and OUT insns */
		if (cpu->regs->errcode == 0) {
			if (emulate_insn(cpu))
			setup_emulate_insn(cpu);
			return;
		}
		break;
+149 −0
Original line number Diff line number Diff line
@@ -41,6 +41,7 @@
#include <signal.h>
#include <pwd.h>
#include <grp.h>
#include <sys/user.h>

#ifndef VIRTIO_F_ANY_LAYOUT
#define VIRTIO_F_ANY_LAYOUT		27
@@ -1143,6 +1144,150 @@ static void handle_output(unsigned long addr)
	      strnlen(from_guest_phys(addr), guest_limit - addr));
}

/*L:216
 * This is where we emulate a handful of Guest instructions.  It's ugly
 * and we used to do it in the kernel but it grew over time.
 */

/*
 * We use the ptrace syscall's pt_regs struct to talk about registers
 * to lguest: these macros convert the names to the offsets.
 */
#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
#define setreg(name, val) \
	setreg_off(offsetof(struct user_regs_struct, name), (val))

static u32 getreg_off(size_t offset)
{
	u32 r;
	unsigned long args[] = { LHREQ_GETREG, offset };

	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
		err(1, "Getting register %u", offset);
	if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
		err(1, "Reading register %u", offset);

	return r;
}

static void setreg_off(size_t offset, u32 val)
{
	unsigned long args[] = { LHREQ_SETREG, offset, val };

	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
		err(1, "Setting register %u", offset);
}

static void emulate_insn(const u8 insn[])
{
	unsigned long args[] = { LHREQ_TRAP, 13 };
	unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
	unsigned int eax, port, mask;
	/*
	 * We always return all-ones on IO port reads, which traditionally
	 * means "there's nothing there".
	 */
	u32 val = 0xFFFFFFFF;

	/*
	 * This must be the Guest kernel trying to do something, not userspace!
	 * The bottom two bits of the CS segment register are the privilege
	 * level.
	 */
	if ((getreg(xcs) & 3) != 0x1)
		goto no_emulate;

	/* Decoding x86 instructions is icky. */

	/*
	 * Around 2.6.33, the kernel started using an emulation for the
	 * cmpxchg8b instruction in early boot on many configurations.  This
	 * code isn't paravirtualized, and it tries to disable interrupts.
	 * Ignore it, which will Mostly Work.
	 */
	if (insn[insnlen] == 0xfa) {
		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
		insnlen = 1;
		goto skip_insn;
	}

	/*
	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
	 */
	if (insn[insnlen] == 0x66) {
		small_operand = 1;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
	}

	/* If the lower bit isn't set, it's a single byte access */
	byte_access = !(insn[insnlen] & 1);

	/*
	 * Now we can ignore the lower bit and decode the 4 opcodes
	 * we need to emulate.
	 */
	switch (insn[insnlen] & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		port = insn[insnlen+1];
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		port = getreg(edx) & 0xFFFF;
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		port = insn[insnlen+1];
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		port = getreg(edx) & 0xFFFF;
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		goto no_emulate;
	}

	/* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
	if (byte_access)
		mask = 0xFF;
	else if (small_operand)
		mask = 0xFFFF;
	else
		mask = 0xFFFFFFFF;

	/*
	 * If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.
	 */
	eax = getreg(eax);

	if (in) {
		/* Clear the bits we're about to read */
		eax &= ~mask;
		/* Copy bits in from val. */
		eax |= val & mask;
		/* Now update the register. */
		setreg(eax, eax);
	}

	verbose("IO %s of %x to %u: %#08x\n",
		in ? "IN" : "OUT", mask, port, eax);
skip_insn:
	/* Finally, we've "done" the instruction, so move past it. */
	setreg(eip, getreg(eip) + insnlen);
	return;

no_emulate:
	/* Inject trap into Guest. */
	if (write(lguest_fd, args, sizeof(args)) < 0)
		err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
}


/*L:190
 * Device Setup
 *
@@ -1832,6 +1977,10 @@ static void __attribute__((noreturn)) run_guest(void)
				verbose("Notify on address %#08x\n",
					notify.addr);
				handle_output(notify.addr);
			} else if (notify.trap == 13) {
				verbose("Emulating instruction at %#x\n",
					getreg(eip));
				emulate_insn(notify.insn);
			} else
				errx(1, "Unknown trap %i addr %#08x\n",
				     notify.trap, notify.addr);