diff --git a/libsel4vm/CMakeLists.txt b/libsel4vm/CMakeLists.txt index 5b5a79755..dbffcd99d 100644 --- a/libsel4vm/CMakeLists.txt +++ b/libsel4vm/CMakeLists.txt @@ -47,7 +47,6 @@ file( sources src/*.c src/arch/${KernelArch}/*.c - src/arch/${KernelArch}/i8259/*.c src/arch/${KernelArch}/processor/*.c src/sel4_arch/${KernelSel4Arch}/*.c ) @@ -67,6 +66,7 @@ target_include_directories( ) target_link_libraries( sel4vm + sel4vmmplatsupport muslc sel4 sel4simple diff --git a/libsel4vm/arch_include/x86/sel4vm/arch/boot_arch.h b/libsel4vm/arch_include/x86/sel4vm/arch/boot_arch.h new file mode 100644 index 000000000..632ccb0a7 --- /dev/null +++ b/libsel4vm/arch_include/x86/sel4vm/arch/boot_arch.h @@ -0,0 +1,16 @@ +/* + * Copyright 2022, UNSW (ABN 57 195 873 179) + * + * SPDX-License-Identifier: BSD-2-Clause + */ +#pragma once + +#include + +/*** + * @function vm_assign_vcpu_timer(vcpu, timer_emul) + * Assign a vcpu with timer functions to emulate a timer. + * @param {vm_vcpu_t *} vcpu A handle to the VCPU + * @param {struct timer_functions *} timer_emul Timer callbacks used to emulate the VCPU's LAPIC timer + */ +void vm_assign_vcpu_timer(vm_vcpu_t *vcpu, struct timer_functions *timer_emul); \ No newline at end of file diff --git a/libsel4vm/arch_include/x86/sel4vm/arch/guest_x86_irq_controller.h b/libsel4vm/arch_include/x86/sel4vm/arch/guest_x86_irq_controller.h new file mode 100644 index 000000000..1eaa1486a --- /dev/null +++ b/libsel4vm/arch_include/x86/sel4vm/arch/guest_x86_irq_controller.h @@ -0,0 +1,68 @@ +/* + * Copyright 2022, UNSW (ABN 57 195 873 179) + * + * SPDX-License-Identifier: BSD-2-Clause + */ +#pragma once + +#include +#include +#include +#include + +#define I8259_NR_IRQS 16 +#define LAPIC_NR_IRQS 14 +#define NR_IRQS (I8259_NR_IRQS + LAPIC_NR_IRQS) + +/** + * seL4 vectors + * + * For x86 there are 256 idt entries. Vectors are used to index + * into the idt and invoke interrupt handlers. This is how vectors + * are reserved in seL4: + * + * 0 - 31 : system traps and exceptions (hardcoded) + * 32 - ... 
: start of seL4 vectors + * 32 - 47 : PIC interrupts/kernel only + * 48 : start of user interrupts, when an irq is passed into an seL4 + * system call, the _vector_ it is assigned to is (irq + 48). + */ +#define USER_IRQ_TO_CPU_VECTOR(x) ((x) + 48) + + +typedef struct x86_irq_msi_cookie { + /* The original MSI data the guest programmed. We copy + * this back on every irq injection. */ + pci_msi_data_t data; +} x86_irq_msi_cookie_t; + +typedef struct x86_irq_cookie { + bool is_msi; + x86_irq_msi_cookie_t msi_cookie; /* Cookie for MSI interrupts */ + seL4_CPtr irq_cap; /* Cap to ack on after EOI signal from guest */ +} x86_irq_cookie_t; + +/* Struct to store information needed when injecting or ack'ing interrupts */ +typedef struct irq_info { + irq_ack_fn_t callback; + x86_irq_cookie_t *cookie; +} irq_info_t; + +/*** + * @function vm_inject_timer_irq(vcpu) + * Inject a timer IRQ. This is for when the IRQ controller handles the nitty-gritty + * IRQ assignments, and we have no way of telling where the timer IRQ is supposed to go. + * @param {vm_vcpu_t *} vcpu Handle to the VCPU + * @return 0 on success, otherwise -1 for error + */ +int vm_inject_timer_irq(vm_vcpu_t *vcpu); + +/*** + * @function vm_irq_set_msi_data(irq, msi_data) + * Save the msi data for an irq so we can patch values in later when the + * device invokes an msi. 
+ * + * @param {int} irq the msi's irq + * @param {pci_msi_data_t *} msi_data the msi data we want to patch in later + */ +void vm_irq_set_msi_data(int irq, pci_msi_data_t *msi_data); diff --git a/libsel4vm/src/arch/x86/boot.c b/libsel4vm/src/arch/x86/boot.c index 45c1ab7f9..8d82cfe66 100644 --- a/libsel4vm/src/arch/x86/boot.c +++ b/libsel4vm/src/arch/x86/boot.c @@ -97,6 +97,11 @@ static int make_guest_page_dir(vm_t *vm) seL4_PageBits, seL4_AllRights, 1, make_guest_page_dir_continued, NULL); } +void vm_assign_vcpu_timer(vm_vcpu_t *vcpu, struct timer_functions *timer_emul) +{ + vm_apic_set_timer_and_update(vcpu->vcpu_arch.lapic, timer_emul); +} + int vm_init_arch(vm_t *vm) { int err; @@ -143,7 +148,7 @@ int vm_create_vcpu_arch(vm_t *vm, vm_vcpu_t *vcpu) int err; err = seL4_X86_VCPU_SetTCB(vcpu->vcpu.cptr, simple_get_tcb(vm->simple)); assert(err == seL4_NoError); - /* All LAPICs are created enabled, in virtual wire mode */ + /* All LAPICs are created enabled */ vm_create_lapic(vcpu, 1); vcpu->vcpu_arch.guest_state = calloc(1, sizeof(guest_state_t)); if (!vcpu->vcpu_arch.guest_state) { diff --git a/libsel4vm/src/arch/x86/guest_irq_controller.c b/libsel4vm/src/arch/x86/guest_irq_controller.c index 125331584..2833b1fdf 100644 --- a/libsel4vm/src/arch/x86/guest_irq_controller.c +++ b/libsel4vm/src/arch/x86/guest_irq_controller.c @@ -7,7 +7,7 @@ #include #include -#include "i8259/i8259.h" +#include "processor/i8259.h" #include "processor/apicdef.h" #include "processor/lapic.h" diff --git a/libsel4vm/src/arch/x86/i8259/i8259.h b/libsel4vm/src/arch/x86/i8259/i8259.h deleted file mode 100644 index c7bdad398..000000000 --- a/libsel4vm/src/arch/x86/i8259/i8259.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright 2019, Data61, CSIRO (ABN 41 687 119 230) - * - * SPDX-License-Identifier: GPL-2.0-only - */ - -#pragma once - -#include - -/* Init function */ -int i8259_pre_init(vm_t *vm); - -/* Functions to retrieve interrupt state */ -int i8259_get_interrupt(vm_t *vm); -int 
i8259_has_interrupt(vm_t *vm); diff --git a/libsel4vm/src/arch/x86/interrupt.c b/libsel4vm/src/arch/x86/interrupt.c index 4bab8f8a1..29865f993 100644 --- a/libsel4vm/src/arch/x86/interrupt.c +++ b/libsel4vm/src/arch/x86/interrupt.c @@ -15,7 +15,7 @@ #include #include "vm.h" -#include "i8259/i8259.h" +#include "processor/i8259.h" #include "guest_state.h" #include "processor/decode.h" #include "processor/lapic.h" diff --git a/libsel4vm/src/arch/x86/processor/apicdef.h b/libsel4vm/src/arch/x86/processor/apicdef.h index 9f0c5e86c..7769bd754 100644 --- a/libsel4vm/src/arch/x86/processor/apicdef.h +++ b/libsel4vm/src/arch/x86/processor/apicdef.h @@ -86,6 +86,7 @@ #define APIC_ICR_RR_INVALID 0x00000 #define APIC_ICR_RR_INPROG 0x10000 #define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_EDGETRIG 0x00000 #define APIC_INT_LEVELTRIG 0x08000 #define APIC_INT_ASSERT 0x04000 #define APIC_ICR_BUSY 0x01000 @@ -108,15 +109,21 @@ #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 #define APIC_LVT0 0x350 + +/* This mask is shifted by 18 due to the i82489DX not using the TSC bit in the LVTT, ignore */ #define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) #define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) #define SET_APIC_TIMER_BASE(x) (((x) << 18)) + #define APIC_TIMER_BASE_CLKIN 0x0 #define APIC_TIMER_BASE_TMBASE 0x1 #define APIC_TIMER_BASE_DIV 0x2 + +#define APIC_LVT_TIMER_MASK (0x3 << 17) #define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (BIT(17)) #define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) + #define APIC_LVT_MASKED (BIT(16)) #define APIC_LVT_LEVEL_TRIGGER (BIT(15)) #define APIC_LVT_REMOTE_IRR (BIT(14)) diff --git a/libsel4vm/src/arch/x86/i8259/i8259.c b/libsel4vm/src/arch/x86/processor/i8259.c similarity index 96% rename from libsel4vm/src/arch/x86/i8259/i8259.c rename to libsel4vm/src/arch/x86/processor/i8259.c index 69a3d2683..2a5b0b05e 100644 --- a/libsel4vm/src/arch/x86/i8259/i8259.c +++ b/libsel4vm/src/arch/x86/processor/i8259.c @@ -22,14 +22,13 @@ #include 
#include #include +#include #include #include "i8259.h" #define I8259_MASTER 0 #define I8259_SLAVE 1 -#define PIC_NUM_PINS 16 - /*first programmable interrupt controller, master*/ #define X86_IO_PIC_1_START 0x20 #define X86_IO_PIC_1_END 0x21 @@ -42,13 +41,6 @@ #define X86_IO_ELCR_START 0x4d0 #define X86_IO_ELCR_END 0x4d1 -typedef struct i8259_irq_ack { - irq_ack_fn_t callback; - void *cookie; -} i8259_irq_ack_t; - -static i8259_irq_ack_t irq_ack_fns[PIC_NUM_PINS]; - /* PIC Machine state. */ struct i8259_state { unsigned char last_irr; /* Edge detection */ @@ -175,8 +167,10 @@ static void pic_clear_isr(vm_t *vm, struct i8259_state *s, int irq) } if (irq != 2) { - if (irq_ack_fns[irq].callback) { - irq_ack_fns[irq].callback(vm->vcpus[BOOT_VCPU], irq, irq_ack_fns[irq].cookie); + if (irq >= I8259_NR_IRQS) + assert(0); + if (irq_info[irq].callback) { + irq_info[irq].callback(vm->vcpus[BOOT_VCPU], irq, (void *) irq_info[irq].cookie); } } } @@ -277,7 +271,7 @@ static void pic_reset(vm_t *vm, struct i8259_state *s) } #endif - for (irq = 0; irq < PIC_NUM_PINS / 2; irq++) { + for (irq = 0; irq < I8259_NR_IRQS / 2; irq++) { if (edge_irr & (1 << irq)) { pic_clear_isr(vm, s, irq); } @@ -368,7 +362,7 @@ static void pic_ioport_write(vm_vcpu_t *vcpu, struct i8259_state *s, unsigned in //off = (s == &s->pics_state->pics[0]) ? 
0 : 8; s->imr = val; #if 0 - for (irq = 0; irq < PIC_NUM_PINS / 2; irq++) + for (irq = 0; irq < I8259_NR_IRQS / 2; irq++) if (imr_diff & (1 << irq)) /*FIXME: notify the status changes for IMR*/ kvm_fire_mask_notifiers( @@ -714,21 +708,21 @@ int vm_set_irq_level(vm_vcpu_t *vcpu, int irq, int irq_level) return 0; } -int vm_inject_irq(vm_vcpu_t *vcpu, int irq) +int i8259_inject_irq(vm_vcpu_t *vcpu, int irq) { vm_set_irq_level(vcpu, irq, 1); vm_set_irq_level(vcpu, irq, 0); return 0; } -int vm_register_irq(vm_vcpu_t *vcpu, int irq, irq_ack_fn_t fn, void *cookie) +int i8259_register_irq(vm_vcpu_t *vcpu, int irq, irq_ack_fn_t fn, void *cookie) { - if (irq < 0 || irq >= PIC_NUM_PINS) { + if (irq < 0 || irq >= I8259_NR_IRQS) { ZF_LOGE("irq %d is invalid", irq); return -1; } - i8259_irq_ack_t *ack = &irq_ack_fns[irq]; - ack->callback = fn; - ack->cookie = cookie; + irq_info_t *info = &irq_info[irq]; + info->callback = fn; + info->cookie = (x86_irq_cookie_t *) cookie; return 0; } diff --git a/libsel4vm/src/arch/x86/processor/i8259.h b/libsel4vm/src/arch/x86/processor/i8259.h new file mode 100644 index 000000000..4fe2a59c5 --- /dev/null +++ b/libsel4vm/src/arch/x86/processor/i8259.h @@ -0,0 +1,26 @@ +/* + * Copyright 2019, Data61, CSIRO (ABN 41 687 119 230) + * + * SPDX-License-Identifier: GPL-2.0-only + */ + +#pragma once + +#include +#include +#include + +extern irq_info_t irq_info[NR_IRQS]; + +/* Init function */ +int i8259_pre_init(vm_t *vm); + +/* Functions to retrieve interrupt state */ +int i8259_get_interrupt(vm_t *vm); +int i8259_has_interrupt(vm_t *vm); + +/* Inject IRQ into guest PIC */ +int i8259_inject_irq(vm_vcpu_t *vcpu, int irq); + +/* Register IRQ with an ack function for EOIs */ +int i8259_register_irq(vm_vcpu_t *vcpu, int irq, irq_ack_fn_t fn, void *cookie); diff --git a/libsel4vm/src/arch/x86/processor/lapic.c b/libsel4vm/src/arch/x86/processor/lapic.c index 282d40f83..51baba8fb 100644 --- a/libsel4vm/src/arch/x86/processor/lapic.c +++ 
b/libsel4vm/src/arch/x86/processor/lapic.c @@ -27,11 +27,14 @@ #include #include #include +#include +#include +#include #include "processor/lapic.h" #include "processor/apicdef.h" #include "processor/msr.h" -#include "i8259/i8259.h" +#include "i8259.h" #include "interrupt.h" #define APIC_BUS_CYCLE_NS 1 @@ -39,6 +42,8 @@ #define APIC_DEBUG 0 #define apic_debug(lvl,...) do{ if(lvl < APIC_DEBUG){printf(__VA_ARGS__);fflush(stdout);}}while (0) +#define mod_64(x, y) ((x) - (y) * ((x) / (y))) + #define APIC_LVT_NUM 6 /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) @@ -50,6 +55,33 @@ #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 +/* This needs to be generalised to per-cpu for SMP support */ +irq_info_t irq_info[I8259_NR_IRQS + LAPIC_NR_IRQS]; + +/* id used to identify this timer in the timer_server component */ +static int ts_id; +static uint64_t tsc_frequency = 0; + +static int64_t current_time_ns() +{ + return (int64_t) muldivu64(rdtsc_pure(), NS_IN_S, tsc_frequency); +} + +static inline uint64_t timer_tsc_freq(vm_lapic_t *apic) +{ + return apic->lapic_timer.timer_emul->tsc_freq(); +} + +static inline int timer_oneshot_absolute(vm_lapic_t *apic, uint64_t ns) +{ + return apic->lapic_timer.timer_emul->oneshot_absolute(ns); +} + +static inline int timer_stop(vm_lapic_t *apic) +{ + return apic->lapic_timer.timer_emul->stop(); +} + inline static int pic_get_interrupt(vm_t *vm) { return i8259_get_interrupt(vm); @@ -244,6 +276,21 @@ static void UNUSED dump_vector(const char *name, void *bitmap) printf("\n"); } +static inline int apic_lvtt_oneshot(vm_lapic_t *apic) +{ + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT; +} + +static inline int apic_lvtt_period(vm_lapic_t *apic) +{ + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC; +} + +static inline int apic_lvtt_tscdeadline(vm_lapic_t *apic) +{ + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE; +} + static 
int find_highest_vector(void *bitmap) { int vec; @@ -294,7 +341,7 @@ static inline int apic_find_highest_irr(vm_lapic_t *apic) static inline void apic_set_irr(int vec, vm_lapic_t *apic) { if (vec != 0x30) { - apic_debug(5, "!settting irr 0x%x\n", vec); + apic_debug(5, "!setting irr 0x%x\n", vec); } apic->irr_pending = true; @@ -388,6 +435,7 @@ void vm_apic_update_tmr(vm_vcpu_t *vcpu, uint32_t *tmr) static void apic_update_ppr(vm_vcpu_t *vcpu) { + /* Intel SDM 10.8.3.1 Task and Processor Priorities */ uint32_t tpr, isrv, ppr, old_ppr; int isr; vm_lapic_t *apic = vcpu->vcpu_arch.lapic; @@ -549,7 +597,12 @@ static int __apic_accept_irq(vm_vcpu_t *vcpu, int delivery_mode, if (!vm_apic_enabled(apic)) { break; } - apic_debug(4, "####fixed ipi 0x%x to vcpu %d\n", vector, vcpu->vcpu_id); + + /** Note: lapic.c currently assumes all trigger mode for lowest/fixed interrupt + * to be edge triggered, which is why the APIC_TMR register is not cleared. + */ + + apic_debug(4, "###fixed int 0x%x to vcpu %d\n", vector, vcpu->vcpu_id); result = 1; apic_set_irr(vector, apic); @@ -625,6 +678,15 @@ static int apic_set_eoi(vm_vcpu_t *vcpu) return vector; } + if (vector < NR_IRQS && irq_info[vector].callback) { + /* These callbacks are only setup for the PIC and APIC. For MSIs, the + * guest will choose what vector the interrupts head to and typically + * above 32 (our max supported irqs). Plus I don't think the MSIs need + * an explicit EOI signal. 
+ * TODO: investigate */ + irq_info[vector].callback(vcpu, vector, (void *) irq_info[vector].cookie); + } + apic_clear_isr(vector, apic); apic_update_ppr(vcpu); @@ -659,6 +721,29 @@ static void apic_send_ipi(vm_vcpu_t *vcpu) vm_irq_delivery_to_apic(vcpu, &irq, NULL); } +static uint32_t apic_get_tmcct(vm_lapic_t *apic) +{ + int64_t remaining, now, ns; + uint32_t tmcct; + + /* if initial count is 0, current count should also be 0 */ + if (vm_apic_get_reg(apic, APIC_TMICT) == 0 || + apic->lapic_timer.period == 0) { + return 0; + } + + now = current_time_ns(); + remaining = apic->lapic_timer.target_expiration - now; + if (remaining < 0) { + remaining = 0; + } + + ns = mod_64(remaining, apic->lapic_timer.period); + tmcct = ns / (APIC_BUS_CYCLE_NS * apic->divide_count); + + return tmcct; +} + static uint32_t __apic_read(vm_lapic_t *apic, unsigned int offset) { uint32_t val = 0; @@ -675,9 +760,17 @@ static uint32_t __apic_read(vm_lapic_t *apic, unsigned int offset) apic_debug(2, "Access APIC ARBPRI register which is for P6\n"); break; - case APIC_TMCCT: /* Timer CCR */ + case APIC_TMCCT: + if (apic_lvtt_tscdeadline(apic)) { + /* Shouldn't even get to here but just in case */ + return 0; + } + // val = apic_get_tmcct(apic); break; case APIC_PROCPRI: + /* TODO: unknown if this is needed, Xen does not update PPR + * while KVM does when reading the PROCPRI register */ + // apic_update_ppr(apic->vcpu); val = vm_apic_get_reg(apic, offset); break; default: @@ -688,6 +781,213 @@ static uint32_t __apic_read(vm_lapic_t *apic, unsigned int offset) return val; } +static void vm_apic_inject_pending_timer_irqs(vm_lapic_t *apic) +{ + vm_apic_local_deliver(apic->vcpu, APIC_LVTT); + if (apic_lvtt_oneshot(apic)) { + apic->lapic_timer.tscdeadline = 0; // no clue why this is set to 0 tbh + apic->lapic_timer.target_expiration = 0; + } +} + +static void apic_timer_expired(vm_lapic_t *apic, bool from_timer_fn) +{ + vm_vcpu_t *vcpu = apic->vcpu; + + if (apic->lapic_timer.pending) { + // 
ZF_LOGE("Current pending timer IRQ"); + return; + } + + if (!from_timer_fn && vm_apic_enabled(apic)) { + vm_apic_inject_pending_timer_irqs(apic); + return; + } + + vm_apic_inject_pending_timer_irqs(apic); + apic->lapic_timer.pending++; + if (from_timer_fn) { + vm_vcpu_accept_interrupt(vcpu); + } +} + +static void update_divide_count(vm_lapic_t *apic) +{ + uint32_t tmp1, tmp2, tdcr; + + tdcr = vm_apic_get_reg(apic, APIC_TDCR); + tmp1 = tdcr & 0xf; + tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; + apic->divide_count = 0x1 << (tmp2 & 0x7); +} + + +static void limit_periodic_timer_frequency(vm_lapic_t *apic) +{ + /* + * Do not allow the guest to program periodic timers with small + * interval (cuz that's what KVM does lmfao). + */ + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + int64_t min_period = MIN_TIMER_PERIOD_US * 1000LL; + + if (apic->lapic_timer.period < min_period) { + apic->lapic_timer.period = min_period; + } + } +} + +static inline int64_t tmict_to_ns(vm_lapic_t *apic, uint32_t tmict) +{ + return (int64_t) tmict * APIC_BUS_CYCLE_NS * (int64_t) apic->divide_count; +} + +static void update_target_expiration(vm_lapic_t *apic, uint32_t old_divisor) +{ + int64_t now, remaining, remaining_new; + + apic->lapic_timer.period = + tmict_to_ns(apic, vm_apic_get_reg(apic, APIC_TMICT)); + limit_periodic_timer_frequency(apic); + + now = current_time_ns(); + remaining = apic->lapic_timer.target_expiration - now; + if (remaining < 0) { + remaining = 0; + } + + remaining_new = muldivu64(remaining, (uint64_t) apic->divide_count, old_divisor); + apic->lapic_timer.target_expiration = now + remaining_new; +} + +static bool set_target_expiration(vm_lapic_t *apic, uint32_t count_reg) +{ + int64_t now = current_time_ns(); + int64_t deadline; + + apic->lapic_timer.period = + tmict_to_ns(apic, vm_apic_get_reg(apic, APIC_TMICT)); + + if (!apic->lapic_timer.period) { + apic->lapic_timer.tscdeadline = 0; + return false; + } + + limit_periodic_timer_frequency(apic); + 
deadline = apic->lapic_timer.period; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + if (unlikely(count_reg != APIC_TMICT)) { + deadline = tmict_to_ns(apic, vm_apic_get_reg(apic, count_reg)); + if (unlikely(deadline <= 0)) { + deadline = apic->lapic_timer.period; + } else if (unlikely(deadline > apic->lapic_timer.period)) { + apic_set_reg(apic, count_reg, 0); + deadline = apic->lapic_timer.period; + } + } + } + + /* KVM code for TSC deadline but we don't support. It's here + * in case anyone wants to try in the future. */ +#if 0 + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, deadline); +#endif + apic->lapic_timer.target_expiration = now + deadline; + + return true; +} + +static void apic_cancel_timer(vm_lapic_t *apic) +{ + timer_stop(apic); + apic->lapic_timer.pending = 0; +} + +static void apic_update_lvtt(vm_lapic_t *apic) +{ + uint32_t timer_mode = vm_apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask; + + if (apic->lapic_timer.timer_mode != timer_mode) { + if (apic_lvtt_tscdeadline(apic) != (timer_mode == + APIC_LVT_TIMER_TSCDEADLINE)) { + apic_cancel_timer(apic); + apic_set_reg(apic, APIC_TMICT, 0); + apic->lapic_timer.period = 0; + apic->lapic_timer.tscdeadline = 0; + } + apic->lapic_timer.timer_mode = timer_mode; + limit_periodic_timer_frequency(apic); + } +} + +static void advance_periodic_target_expiration(vm_lapic_t *apic) +{ + /* KVM syncs the periodic and tsc deadline counters in this function + * but I've ommitted for brevity's sake. 
*/ + apic->lapic_timer.target_expiration += apic->lapic_timer.period; +} + +static void start_period(vm_lapic_t *apic) +{ + if (!apic->lapic_timer.period) + return; + + if (current_time_ns() > apic->lapic_timer.target_expiration) { + apic_timer_expired(apic, false); + + if (apic_lvtt_oneshot(apic)) + return; + + advance_periodic_target_expiration(apic); + } + + ZF_LOGE("Starting timer of period %lld", (long long) apic->lapic_timer.period); + + timer_oneshot_absolute(apic, apic->lapic_timer.target_expiration); +} + +static void start_timer(vm_lapic_t *apic) +{ + if (!apic_lvtt_period(apic) && apic->lapic_timer.pending) { /* non-periodic timer with an expiry still pending: do not re-arm */ + return; + } + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + start_period(apic); + } else { + /* TSC deadline unimplemented */ + } +} + +static void apic_restart_timer(vm_lapic_t *apic) +{ + if (!apic_lvtt_period(apic) && apic->lapic_timer.pending) { + return; + } + + start_timer(apic); +} + +static void __start_apic_timer(vm_lapic_t *apic, uint32_t count_reg) +{ + apic->lapic_timer.pending = 0; + + if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + && !set_target_expiration(apic, count_reg)) + return; + + apic_restart_timer(apic); +} + + +static void start_apic_timer(vm_lapic_t *apic) +{ + __start_apic_timer(apic, APIC_TMICT); +} + static void apic_manage_nmi_watchdog(vm_lapic_t *apic, uint32_t lvt0_val) { int nmi_wd_enabled = apic_lvt_nmi_mode(vm_apic_get_reg(apic, APIC_LVT0)); @@ -741,8 +1041,7 @@ static int apic_reg_write(vm_vcpu_t *vcpu, uint32_t reg, uint32_t val) apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } - // atomic_set(&apic->lapic_timer.pending, 0); - + apic->lapic_timer.pending = 0; } break; } @@ -763,7 +1062,7 @@ static int apic_reg_write(vm_vcpu_t *vcpu, uint32_t reg, uint32_t val) case APIC_LVTPC: case APIC_LVT1: case APIC_LVTERR: - /* TODO: Check vector */ + /* KVM TODO: Check vector */ if (!vm_apic_sw_enabled(apic)) { val |= APIC_LVT_MASKED; } @@ -774,15 +1073,48 @@ static int 
apic_reg_write(vm_vcpu_t *vcpu, uint32_t reg, uint32_t val) break; case APIC_LVTT: + /* Timer LVT */ + ZF_LOGE("APIC_LVTT"); + if (!vm_apic_sw_enabled(apic)) { + val |= APIC_LVT_MASKED; + } + val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); apic_set_reg(apic, APIC_LVTT, val); + apic_update_lvtt(apic); break; case APIC_TMICT: + /* Timer initial count */ + ZF_LOGE("APIC_TMICT"); + if (apic_lvtt_tscdeadline(apic)) { + ZF_LOGE("TSC deadline should never be in use"); + break; + } + apic_cancel_timer(apic); apic_set_reg(apic, APIC_TMICT, val); + start_apic_timer(apic); break; case APIC_TDCR: + /* Timer divide config */ + ZF_LOGE("APIC_TDCR"); + uint32_t old_divisor = apic->divide_count; apic_set_reg(apic, APIC_TDCR, val); + update_divide_count(apic); + if (apic->divide_count != old_divisor && + apic->lapic_timer.period) { + timer_stop(apic); + update_target_expiration(apic, old_divisor); + apic_restart_timer(apic); + } + break; + + case APIC_ESR: + /** + * Writing to the APIC ESR clears the ESR according to Pentium errata 3AP. + * Linux does it on purpose (lmao) so we must support, I guess. 
+ */ + apic_set_reg(apic, APIC_ESR, 0); break; default: @@ -813,7 +1145,7 @@ void vm_apic_mmio_write(vm_vcpu_t *vcpu, void *cookie, uint32_t offset, /* too common printing */ if (offset != APIC_EOI) apic_debug(6, "lapic mmio write at %s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __func__, offset, len, data); + "0x%x\n", __func__, offset, len, data); apic_reg_write(vcpu, offset & 0xff0, data); } @@ -862,7 +1194,7 @@ void vm_apic_mmio_read(vm_vcpu_t *vcpu, void *cookie, uint32_t offset, apic_reg_read(apic, offset, len, data); - apic_debug(6, "lapic mmio read on vcpu %d, reg %08x = %08x\n", vcpu->vcpu_id, offset, *data); + apic_debug(6, "lapic mmio read on vcpu %d, reg 0x%x = 0x%x\n", vcpu->vcpu_id, offset, *data); return; } @@ -872,11 +1204,11 @@ memory_fault_result_t apic_fault_callback(vm_t *vm, vm_vcpu_t *vcpu, uintptr_t f { uint32_t data; if (is_vcpu_read_fault(vcpu)) { - vm_apic_mmio_read(vcpu, cookie, APIC_DEFAULT_PHYS_BASE - fault_addr, fault_length, &data); + vm_apic_mmio_read(vcpu, cookie, fault_addr - APIC_DEFAULT_PHYS_BASE, fault_length, &data); set_vcpu_fault_data(vcpu, data); } else { data = get_vcpu_fault_data(vcpu); - vm_apic_mmio_write(vcpu, cookie, APIC_DEFAULT_PHYS_BASE - fault_addr, fault_length, data); + vm_apic_mmio_write(vcpu, cookie, fault_addr - APIC_DEFAULT_PHYS_BASE, fault_length, data); } advance_vcpu_fault(vcpu); return FAULT_HANDLED; @@ -940,30 +1272,42 @@ void vm_lapic_reset(vm_vcpu_t *vcpu) vm_apic_set_id(apic, vcpu->vcpu_id); /* In agreement with ACPI code */ apic_set_reg(apic, APIC_LVR, APIC_VERSION); - for (i = 0; i < APIC_LVT_NUM; i++) { - apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); - } - - apic_set_reg(apic, APIC_DFR, 0xffffffffU); - apic_set_spiv(apic, 0xff); - apic_set_reg(apic, APIC_TASKPRI, 0); - vm_apic_set_ldr(apic, 0); - apic_set_reg(apic, APIC_ESR, 0); - apic_set_reg(apic, APIC_ICR, 0); - apic_set_reg(apic, APIC_ICR2, 0); - apic_set_reg(apic, APIC_TDCR, 0); - apic_set_reg(apic, APIC_TMICT, 0); 
for (i = 0; i < 8; i++) { apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } + apic_set_reg(apic, APIC_ICR, 0); + apic_set_reg(apic, APIC_ICR2, 0); + apic_set_reg(apic, APIC_ESR, 0); + + /* Clear the LDR (unless in x2APIC mode) */ + apic_set_reg(apic, APIC_LDR, 0); + apic_set_reg(apic, APIC_TASKPRI, 0); + apic_set_reg(apic, APIC_TMICT, 0); + apic_set_reg(apic, APIC_TMCCT, 0); + apic_set_reg(apic, APIC_TDCR, 0); + + apic_set_reg(apic, APIC_DFR, 0xffffffffU); + + for (i = 0; i < APIC_LVT_NUM; i++) { + apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + } + + apic_set_spiv(apic, 0xff); + apic->irr_pending = 0; apic->isr_count = 0; apic->highest_isr_cache = -1; apic_update_ppr(vcpu); + update_divide_count(apic); + apic->lapic_timer.pending = 0; + + apic->arb_prio = 0; - vcpu->vcpu_arch.lapic->arb_prio = 0; + /* AMD does not implement the TSC deadline timer, so until we have AMD cpu + * virtualization, just don't implement the TSC */ + apic->lapic_timer.timer_mode_mask = BIT(17); apic_debug(4, "%s: vcpu=%p, id=%d, base_msr=" "0x%016x\n", __func__, @@ -996,7 +1340,7 @@ int vm_create_lapic(vm_vcpu_t *vcpu, int enabled) vcpu->vcpu_arch.lapic = apic; - apic->regs = calloc(1, sizeof(struct local_apic_regs)); // TODO this is a page; allocate a page + apic->regs = calloc(1, PAGE_SIZE_4K); if (!apic->regs) { printf("calloc apic regs error for vcpu %x\n", vcpu->vcpu_id); @@ -1012,6 +1356,8 @@ int vm_create_lapic(vm_vcpu_t *vcpu, int enabled) /* mainly init registers */ vm_lapic_reset(vcpu); + apic->vcpu = vcpu; + return 0; nomem_free_apic: free(apic); @@ -1051,7 +1397,12 @@ int vm_apic_get_interrupt(vm_vcpu_t *vcpu) /* Return which vector is next up for servicing */ int vm_apic_has_interrupt(vm_vcpu_t *vcpu) { + /* Early boot sequence apic might not be set up yet */ vm_lapic_t *apic = vcpu->vcpu_arch.lapic; + if (!apic) { + return -1; + } + int highest_irr; if 
(vm_apic_accept_pic_intr(vcpu) && pic_has_interrupt(vcpu->vm)) { @@ -1059,15 +1410,18 @@ int vm_apic_has_interrupt(vm_vcpu_t *vcpu) } highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= vm_apic_get_reg(apic, APIC_PROCPRI))) { + if (highest_irr == -1) { + return -1; + } + + uint32_t ppr = vm_apic_get_reg(apic, APIC_PROCPRI); + if ((highest_irr & 0xF0) <= ppr) { return -1; } return highest_irr; } -#if 0 int vm_apic_local_deliver(vm_vcpu_t *vcpu, int lvt_type) { vm_lapic_t *apic = vcpu->vcpu_arch.lapic; @@ -1082,4 +1436,77 @@ int vm_apic_local_deliver(vm_vcpu_t *vcpu, int lvt_type) } return 0; } -#endif + +int vm_inject_timer_irq(vm_vcpu_t *vcpu) +{ + vm_lapic_t *apic = vcpu->vcpu_arch.lapic; + apic_timer_expired(apic, true); + apic->lapic_timer.pending = 0; + + /* if periodic timer, re-arm it for the next period */ + if (apic_lvtt_period(apic)) { + advance_periodic_target_expiration(apic); + timer_oneshot_absolute(apic, apic->lapic_timer.target_expiration); + } else { + timer_stop(apic); + } + + return 0; +} + +void vm_apic_set_timer_and_update(vm_lapic_t *apic, struct timer_functions *timer_emul) +{ + apic->lapic_timer.timer_emul = timer_emul; + tsc_frequency = timer_tsc_freq(apic); +} + +void vm_irq_set_msi_data(int irq, pci_msi_data_t *msi_data) +{ + memcpy(&irq_info[irq].cookie->msi_cookie.data, msi_data, sizeof(*msi_data)); +} + +int vm_inject_irq(vm_vcpu_t *vcpu, int irq) +{ + /* if legacy irq send to PIC */ + if (irq < I8259_NR_IRQS) { + return i8259_inject_irq(vcpu, irq); + } + + int vector, delivery_mode, level, trig_mode; + + if (irq_info[irq].cookie && irq_info[irq].cookie->is_msi) { + /* If MSI we need to patch in the values the guest programmed originally */ + pci_msi_data_t msi_data = irq_info[irq].cookie->msi_cookie.data; + + /* The MSI data register and APIC ISR are practically the same. 
+ * Even if they aren't MSIs are always fixed and edge triggered.*/ + vector = msi_data.value & APIC_VECTOR_MASK; + delivery_mode = msi_data.value & APIC_DM_FIXED_MASK; + trig_mode = msi_data.value & APIC_INT_LEVELTRIG; + level = msi_data.value & APIC_INT_ASSERT; /* Ignored if edge trig */ + } else { + /* Not sure what this should be programmed, just leaving it as fixed + * edge triggered for now. */ + vector = irq; + delivery_mode = APIC_DM_FIXED; + trig_mode = APIC_INT_EDGETRIG; + level = 0; /* Ignored if edge trig */ + } + + vm_lapic_t *apic = vcpu->vcpu_arch.lapic; + + int ret = __apic_accept_irq(vcpu, delivery_mode, vector, level, trig_mode, NULL); + return (ret) ? 0 : 1; +} + +int vm_register_irq(vm_vcpu_t *vcpu, int irq, irq_ack_fn_t fn, void *cookie) +{ + /* if legacy irq send to PIC to deal with*/ + if (irq < I8259_NR_IRQS) { + return i8259_register_irq(vcpu, irq, fn, cookie); + } + irq_info_t *info = &irq_info[irq]; + info->callback = fn; + info->cookie = (x86_irq_cookie_t *) cookie; + return 0; +} diff --git a/libsel4vm/src/arch/x86/processor/lapic.h b/libsel4vm/src/arch/x86/processor/lapic.h index 1e2b6dfc6..491db637e 100644 --- a/libsel4vm/src/arch/x86/processor/lapic.h +++ b/libsel4vm/src/arch/x86/processor/lapic.h @@ -7,6 +7,7 @@ #pragma once #include +#include enum vm_lapic_state { LAPIC_STATE_NEW, @@ -14,20 +15,28 @@ enum vm_lapic_state { LAPIC_STATE_RUN }; -#if 0 +#define MIN_TIMER_PERIOD_US 200 + struct vm_timer { - struct hrtimer timer; - int64_t period; /* unit: ns */ + /* emulated timer representation */ + struct timer_functions *timer_emul; + /* unit: ns */ + int64_t period; + int64_t target_expiration; + /* mask that indicates what timer modes are supported */ uint32_t timer_mode_mask; + /* current timer mode */ + uint32_t timer_mode; + int pending; + /* we don't support tsc deadlines but keep some references */ uint64_t tscdeadline; - atomic_t pending; /* accumulated triggered timers */ }; -#endif typedef struct vm_lapic { uint32_t 
apic_base; // BSP flag is ignored in this - //struct vm_timer lapic_timer; + /* Each local APIC implements a timer */ + struct vm_timer lapic_timer; uint32_t divide_count; bool irr_pending; @@ -45,6 +54,9 @@ typedef struct vm_lapic { enum vm_lapic_state state; int arb_prio; + + /* back pointer */ + vm_vcpu_t *vcpu; } vm_lapic_t; int vm_apic_enabled(vm_lapic_t *apic); @@ -70,3 +82,5 @@ memory_fault_result_t apic_fault_callback(vm_t *vm, vm_vcpu_t *vcpu, uintptr_t f uint64_t vm_get_lapic_tscdeadline_msr(vm_vcpu_t *vcpu); void vm_set_lapic_tscdeadline_msr(vm_vcpu_t *vcpu, uint64_t data); +void vm_apic_set_timer_and_update(vm_lapic_t *apic, struct timer_functions *timer_emul); + diff --git a/libsel4vm/src/arch/x86/vm.c b/libsel4vm/src/arch/x86/vm.c index 9a0668943..15dfbb1b1 100644 --- a/libsel4vm/src/arch/x86/vm.c +++ b/libsel4vm/src/arch/x86/vm.c @@ -20,7 +20,7 @@ #include #include "vm.h" -#include "i8259/i8259.h" +#include "processor/i8259.h" #include "interrupt.h" #include "guest_state.h" @@ -190,8 +190,7 @@ int vm_run_arch(vm_t *vm) err = vm->run.notification_callback(vm, badge, tag, vm->run.notification_callback_cookie); if (err == -1) { ret = VM_EXIT_HANDLE_ERROR; - } else if (i8259_has_interrupt(vm)) { - /* Check if this caused PIC to generate interrupt */ + } else { vm_check_external_interrupt(vm); } } else { diff --git a/libsel4vmmplatsupport/arch_include/x86/sel4vmmplatsupport/arch/drivers/timer_emul.h b/libsel4vmmplatsupport/arch_include/x86/sel4vmmplatsupport/arch/drivers/timer_emul.h new file mode 100644 index 000000000..1bb469f13 --- /dev/null +++ b/libsel4vmmplatsupport/arch_include/x86/sel4vmmplatsupport/arch/drivers/timer_emul.h @@ -0,0 +1,22 @@ +/* + * Copyright 2022, UNSW (ABN 57 195 873 179) + * + * SPDX-License-Identifier: BSD-2-Clause + */ +#pragma once +/*** + * @module vmm_pci_helper.h + * This interface presents a series of helpers for timer support on x86. 
+ */ + +typedef uint64_t (*timer_tsc_freq_fn_t)(void); /* NOTE(review): uint64_t is used here but this header has no #include of its own - it relies on the includer for <stdint.h> */ +typedef int (*timer_oneshot_absolute_fn_t)(uint64_t ns); +typedef int (*timer_oneshot_relative_fn_t)(uint64_t ns); +typedef int (*timer_stop_fn_t)(void); + +struct timer_functions { /* Table of timer callbacks; a pointer to it is taken by vm_assign_vcpu_timer() and vm_apic_set_timer_and_update() */ + timer_tsc_freq_fn_t tsc_freq; + timer_oneshot_absolute_fn_t oneshot_absolute; + timer_oneshot_relative_fn_t oneshot_relative; + timer_stop_fn_t stop; +}; diff --git a/libsel4vmmplatsupport/include/sel4vmmplatsupport/drivers/pci_helper.h b/libsel4vmmplatsupport/include/sel4vmmplatsupport/drivers/pci_helper.h index 1c7af64f5..4d25c9012 100644 --- a/libsel4vmmplatsupport/include/sel4vmmplatsupport/drivers/pci_helper.h +++ b/libsel4vmmplatsupport/include/sel4vmmplatsupport/drivers/pci_helper.h @@ -142,6 +142,20 @@ typedef struct pci_irq_emulation { int irq; } pci_irq_emulation_t; +/*** + * @struct pci_msi_emulation + * Wrapper data structure over a pci entry and its configuration space. This is leveraged to emulate + * MSI capabilities in an entry's configuration space + * @param {vmm_pci_entry_t} passthrough PCI entry being emulated + * @param {int} vmm_irq The vmm's irq we want the msi to be forwarded to + * @param {uint8_t} msi_cap_offset Offset of the MSI capability within the entry's configuration space + */ +typedef struct pci_msi_emulation { + vmm_pci_entry_t passthrough; + int vmm_irq; + uint8_t msi_cap_offset; +} pci_msi_emulation_t; + /*** * @struct pci_passthrough_device * Datastructure providing direct passthrough access to a pci entry configuration space @@ -243,6 +257,16 @@ vmm_pci_entry_t vmm_pci_create_bar_emulation(vmm_pci_entry_t existing, int num_b */ vmm_pci_entry_t vmm_pci_create_irq_emulation(vmm_pci_entry_t existing, int irq); +/*** + * @function vmm_pci_create_msi_emulation(existing, vmm_irq, msi_cap_offset) + * Construct a pci entry that emulates configuration space MSI reads/writes.
The rest of the configuration space is passed on + * @param {vmm_pci_entry_t} existing Existing PCI entry to wrap over and emulate its msi accesses + * @param {int} vmm_irq The vmm's irq we want the msi to be forwarded to + * @param {uint8_t} msi_cap_offset Offset of the MSI capability within the entry's configuration space + * @return `vmm_pci_entry_t` for emulated msi device + */ +vmm_pci_entry_t vmm_pci_create_msi_emulation(vmm_pci_entry_t existing, int vmm_irq, uint8_t msi_cap_offset); + /*** * @function vmm_pci_create_cap_emulation(existing, num_caps, cap, num_ranges, range_starts, range_ends) * Capability space emulation. Takes list of addresses to use to form a capability linked list, as well as a @@ -263,4 +287,4 @@ vmm_pci_entry_t vmm_pci_create_c * @param {vmm_pci_entry_t} existing Existing PCI entry to wrap over with ignored MSI capabilities * @return `vmm_pci_entry_t` with an emulated capability space (ignoring MSI capabilties) */ -vmm_pci_entry_t vmm_pci_no_msi_cap_emulation(vmm_pci_entry_t existing); +vmm_pci_entry_t vmm_pci_cap_emulation(vmm_pci_entry_t existing, bool enable_msi, int vmm_irq); /* NOTE(review): the doc comment above appears to still describe the old no-MSI variant and does not mention enable_msi / vmm_irq - update it */ diff --git a/libsel4vmmplatsupport/include/sel4vmmplatsupport/drivers/pci_msi.h b/libsel4vmmplatsupport/include/sel4vmmplatsupport/drivers/pci_msi.h new file mode 100644 index 000000000..1cc4f9621 --- /dev/null +++ b/libsel4vmmplatsupport/include/sel4vmmplatsupport/drivers/pci_msi.h @@ -0,0 +1,59 @@ + +/* + * Copyright 2022, UNSW (ABN 57 195 873 179) + * + * SPDX-License-Identifier: BSD-2-Clause + */ +#pragma once + +#include +#include +#include +#include + +typedef struct pci_msi_control { /* 16-bit register image: the bitfields and `value` alias the same storage through the union */ + union { + struct { + uint16_t enable :1, + multi_msg_capable :3, + multi_msg_enable :3, + addr_64_bit :1, + reserved :8; + }; + uint16_t value; + }; +} PACKED pci_msi_control_t; + +typedef struct pci_msi_data { /* 32-bit image; only the low 16 bits are named as bitfields, `value` gives the whole word */ + union { + struct { + uint32_t vector :8, + delivery_mode :3, + dest_mode_logical :1, + reserved :2, + assert :1, + is_level :1; + }; + 
uint32_t value; + }; +} PACKED pci_msi_data_t; + +typedef struct pci_msi_addr_lo { /* 32-bit image; the bitfield widths sum to 32 and alias `value` through the union */ + union { + struct { + uint32_t reserved_0 :2, + dest_mode_logical :1, + redirect_hint :1, + reserved_1 :1, + virt_destid_8_14 :7, + destid_0_7 :8, + base_address :12; /* Always 0xFEE */ + }; + uint32_t value; + }; +} PACKED pci_msi_addr_lo_t; + +typedef struct pci_msi_addr_hi { /* NOTE(review): the typedef name below is pci_msi_msg_addr_hi_t while its sibling is pci_msi_addr_lo_t - confirm the "msg_" is intended */ + uint32_t reserved :8, + destid_8_31 :24; +} PACKED pci_msi_msg_addr_hi_t; diff --git a/libsel4vmmplatsupport/src/arch/x86/acpi.c b/libsel4vmmplatsupport/src/arch/x86/acpi.c index 12aea83e1..7a822ad90 100644 --- a/libsel4vmmplatsupport/src/arch/x86/acpi.c +++ b/libsel4vmmplatsupport/src/arch/x86/acpi.c @@ -151,7 +151,7 @@ int make_guest_acpi_tables(vm_t *vm) // MADT int madt_size = sizeof(acpi_madt_t) - /* + sizeof(acpi_madt_ioapic_t)*/ + // + sizeof(acpi_madt_ioapic_t) + sizeof(acpi_madt_local_apic_t) * cpus; acpi_madt_t *madt = calloc(1, madt_size); acpi_fill_table_head(&madt->header, "APIC", 3); @@ -180,7 +180,7 @@ int make_guest_acpi_tables(vm_t *vm) .type = ACPI_APIC_LOCAL, .length = sizeof(acpi_madt_local_apic_t) }, - .processor_id = i + 1, + .processor_id = i, .apic_id = i, .flags = APIC_FLAGS_ENABLED }; @@ -213,12 +213,13 @@ int make_guest_acpi_tables(vm_t *vm) } uintptr_t xsdt_addr = lower_bios_addr + (XSDT_START - LOWER_BIOS_START); + uintptr_t xsdt_paddr = XSDT_START; /* presumably the guest-physical XSDT address, as opposed to xsdt_addr which is relative to lower_bios_addr - confirm */ acpi_xsdt_t *xsdt = calloc(1, xsdt_size); acpi_fill_table_head(&xsdt->header, "XSDT", 1); // Add previous tables to XSDT pointer list - uintptr_t table_paddr = xsdt_addr + xsdt_size; + uintptr_t table_paddr = xsdt_paddr + xsdt_size; uint64_t *entry = (uint64_t *)((char *)xsdt + sizeof(acpi_xsdt_t)); for (int i = 1; i < num_tables; i++) { *entry++ = (uint64_t)table_paddr; @@ -247,11 +248,11 @@ int make_guest_acpi_tables(vm_t *vm) .oem_id = "NICTA ", .revision = 2, /* ACPI v3*/ .checksum = 0, - .rsdt_address = xsdt_addr, + .rsdt_address = xsdt_paddr, /* rsdt_addrss will not be inspected as the xsdt is present.
This is not ACPI 1 compliant */ .length = sizeof(acpi_rsdp_t), - .xsdt_address = xsdt_addr, + .xsdt_address = xsdt_paddr, .extended_checksum = 0, .reserved = {0} }; diff --git a/libsel4vmmplatsupport/src/arch/x86/drivers/vmm_pci_helper.c b/libsel4vmmplatsupport/src/arch/x86/drivers/vmm_pci_helper.c index 0977d2da2..87dba6f9e 100644 --- a/libsel4vmmplatsupport/src/arch/x86/drivers/vmm_pci_helper.c +++ b/libsel4vmmplatsupport/src/arch/x86/drivers/vmm_pci_helper.c @@ -151,7 +151,7 @@ ioport_fault_result_t vmm_pci_io_port_out(vm_vcpu_t *vcpu, void *cookie, unsigne ZF_LOGI("Guest attempted access to non existent device %02x:%02x.%d register 0x%x", addr.bus, addr.dev, addr.fun, reg); return IO_FAULT_HANDLED; } - int err = dev->iowrite(dev->cookie, reg + offset, size, value); + int err = dev->iowrite(dev->cookie, reg, size, value); /* NOTE(review): offset is no longer added to reg for the write - confirm reg already accounts for it */ if (err) { return IO_FAULT_ERROR; } diff --git a/libsel4vmmplatsupport/src/drivers/pci_helper.c b/libsel4vmmplatsupport/src/drivers/pci_helper.c index 209cf8042..487f0dbb4 100644 --- a/libsel4vmmplatsupport/src/drivers/pci_helper.c +++ b/libsel4vmmplatsupport/src/drivers/pci_helper.c @@ -13,8 +13,12 @@ #include #include +#include + +#include #define PCI_CAPABILITY_SPACE_OFFSET 0x40 +#define PCI_MSI_CAP_SIZE 0x18 /* number of bytes, starting at the MSI capability offset, that pci_msi_emul_write treats as part of the MSI capability */ /* Read PCI memory device */ int vmm_pci_mem_device_read(void *cookie, int offset, int size, uint32_t *result) @@ -200,6 +204,56 @@ static int pci_irq_emul_write(void *cookie, int offset, int size, uint32_t value } } +static int pci_msi_emul_read(void *cookie, int offset, int size, uint32_t *result) +{ + /* Reads are never patched: forward them unchanged to the wrapped entry's ioread and return its result */ + pci_msi_emulation_t *emul = (pci_msi_emulation_t *)cookie; + return emul->passthrough.ioread(emul->passthrough.cookie, offset, size, result); +} + +static int pci_msi_emul_write(void *cookie, int offset, int size, uint32_t value) +{ /* Write hook: a write that lands inside the MSI capability may have its value patched before it is forwarded; every write ends in the wrapped entry's iowrite, whose result is returned */ + pci_msi_emulation_t *emul = (pci_msi_emulation_t *)cookie; + if (offset >= emul->msi_cap_offset && offset < emul->msi_cap_offset + PCI_MSI_CAP_SIZE) { + /* 
Patch our values in before writing */ + uint8_t msi_cap_offset = emul->msi_cap_offset; + + uint32_t msi_control_raw = 0; /* ioread stores through a uint32_t *, so read into a full word; stays 0 (32-bit layout) if the read fails */ + emul->passthrough.ioread(emul->passthrough.cookie, msi_cap_offset + 2, 2, &msi_control_raw); /* Message Control is at cap + 2; cap + 0 holds the capability ID and next pointer */ + pci_msi_control_t msi_control = { .value = (uint16_t) msi_control_raw }; + if (msi_control.addr_64_bit && offset - msi_cap_offset == 0x8) { + /* Right now we specify the exact APIC this MSI is delivered to + * and as such the top bits are not set */ + value = 0; + } else if (offset - msi_cap_offset == (msi_control.addr_64_bit ? 0xC : 0x8)) { + /* If 64-bit addressing is enabled, the data register is at 0xC, else 0x8 + * refer to: PCI local specification 2.2 */ + pci_msi_data_t *msi_data = (pci_msi_data_t *) &value; + + /* The guest OS may program an invalid vector into the MSI vector + * in which case we want to leave it alone, or the device will attempt + * a DMA into a vector that's not set up yet. */ + // ZF_LOGE("Vector %d, Delivery Mode 0x%x, Trigger Mode %d", value & 0xff, (value >> 8) & 0b111, (value >> 15) & 1); + if (msi_data->vector == 0 || msi_data->vector == 239) { + /* Do nothing if MSI vector not a sane value (hardcoded to linux right now) */ + } else { + /* Save the data so we can patch it in later when the irq arrives. */ + vm_irq_set_msi_data(emul->vmm_irq, msi_data); + msi_data->vector = USER_IRQ_TO_CPU_VECTOR(emul->vmm_irq); + } + // ZF_LOGE("Vector %d, Delivery Mode 0x%x, Trigger Mode %d", value & 0xff, (value >> 8) & 0b111, (value >> 15) & 1); + } else if (offset - msi_cap_offset == 0x4) { + pci_msi_addr_lo_t *addr = (pci_msi_addr_lo_t *) &value; + /* Set the destination to the physical APIC. This is the usual value + * but we need a way to obtain the value from boot ACPI tables + * and not make assumptions.
*/ + addr->value = 0xfee00000; + // ZF_LOGE("Dest 0x%x, RH 0x%x, DM 0x%x\n", addr->destid_0_7, addr->redirect_hint, addr->dest_mode_logical); + } + } + return emul->passthrough.iowrite(emul->passthrough.cookie, offset, size, value); +} + static int pci_bar_emul_read(void *cookie, int offset, int size, uint32_t *result) { pci_bar_emulation_t *emul = (pci_bar_emulation_t *)cookie; @@ -243,6 +297,18 @@ vmm_pci_entry_t vmm_pci_create_bar_emulation(vmm_pci_entry_t existing, int num_b }; } +vmm_pci_entry_t vmm_pci_create_msi_emulation(vmm_pci_entry_t existing, int vmm_irq, uint8_t msi_cap_offset) +{ /* Wrap `existing` in a heap-allocated pci_msi_emulation_t and return an entry whose ioread/iowrite are pci_msi_emul_read/pci_msi_emul_write. The allocation is asserted non-NULL and is not freed here. */ + pci_msi_emulation_t *msi_emul = calloc(1, sizeof(*msi_emul)); + assert(msi_emul); + msi_emul->passthrough = existing; + msi_emul->vmm_irq = vmm_irq; + msi_emul->msi_cap_offset = msi_cap_offset; + return (vmm_pci_entry_t) { + .cookie = msi_emul, .ioread = pci_msi_emul_read, .iowrite = pci_msi_emul_write + }; +} + vmm_pci_entry_t vmm_pci_create_irq_emulation(vmm_pci_entry_t existing, int irq) { pci_irq_emulation_t *irq_emul = calloc(1, sizeof(*irq_emul)); @@ -365,9 +431,10 @@ vmm_pci_entry_t vmm_pci_create_cap_emulation(vmm_pci_entry_t existing, int num_c #define MAX_CAPS 256 -vmm_pci_entry_t vmm_pci_no_msi_cap_emulation(vmm_pci_entry_t existing) +vmm_pci_entry_t vmm_pci_cap_emulation(vmm_pci_entry_t existing, bool enable_msi, int vmm_irq) { uint32_t value; + uint8_t msi_cap_offset = 0; /* stays 0 unless an MSI capability is found while enable_msi is set */ int UNUSED error; /* Ensure this is a type 0 device */ value = 0; @@ -398,11 +465,19 @@ vmm_pci_entry_t vmm_pci_no_msi_cap_emulation(vmm_pci_entry_t existing) error = existing.ioread(existing.cookie, value, 1, &cap_type); assert(!error); if (cap_type == PCI_CAP_ID_MSI) { - assert(num_ignore < 2); - ignore_start[num_ignore] = value; - ignore_end[num_ignore] = value + 20; - num_ignore++; + if (enable_msi) { /* MSI enabled: remember the capability's offset and keep it in the caps list */ + msi_cap_offset = (uint8_t) value & 0xff; + assert(num_caps < MAX_CAPS); + caps[num_caps] = (uint8_t)value; + num_caps++; + } else { /* MSI disabled: record the capability as an ignored range instead */ + assert(num_ignore < 2); + ignore_start[num_ignore] = 
value; + ignore_end[num_ignore] = value + 20; + num_ignore++; + } } else if (cap_type == PCI_CAP_ID_MSIX) { + assert(num_ignore < 2); ignore_start[num_ignore] = value; ignore_end[num_ignore] = value + 8; num_ignore++; @@ -414,9 +489,14 @@ vmm_pci_entry_t vmm_pci_no_msi_cap_emulation(vmm_pci_entry_t existing) error = existing.ioread(existing.cookie, value + 1, 1, &value); assert(!error); } + + if (enable_msi && msi_cap_offset != 0) { /* only wrap when the walk actually found an MSI capability; with offset 0 the MSI write hook would patch writes to config offsets 0x00-0x17 */ + existing = vmm_pci_create_msi_emulation(existing, vmm_irq, msi_cap_offset); + } + if (num_ignore > 0) { - return vmm_pci_create_cap_emulation(existing, num_caps, caps, num_ignore, ignore_start, ignore_end); - } else { - return existing; + existing = vmm_pci_create_cap_emulation(existing, num_caps, caps, num_ignore, ignore_start, ignore_end); } + + return existing; }