diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index f86ad3335529..56689e83fb75 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -68,18 +68,22 @@ static inline void kernel_fpu_begin(void) */ static inline void fpregs_lock(void) { +#ifndef __PKVM_HYP__ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_disable(); else preempt_disable(); +#endif } static inline void fpregs_unlock(void) { +#ifndef __PKVM_HYP__ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_enable(); else preempt_enable(); +#endif } /* diff --git a/arch/x86/include/asm/kvm_pkvm.h b/arch/x86/include/asm/kvm_pkvm.h index 033fbec08d48..d38f5423966d 100644 --- a/arch/x86/include/asm/kvm_pkvm.h +++ b/arch/x86/include/asm/kvm_pkvm.h @@ -312,6 +312,7 @@ enum pkvm_fn { __pkvm__cache_reg, __pkvm__update_cpuid_runtime, __pkvm__update_exception_bitmap, + __pkvm__vcpu_add_fpstate, }; #define HOST_HANDLE_EXIT 0 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 0b86a5002c84..273fc5a6ffe8 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -184,7 +184,9 @@ static inline void os_xsave(struct fpstate *fpstate) u32 hmask = mask >> 32; int err; +#ifndef __PKVM_HYP__ WARN_ON_FPU(!alternatives_patched); +#endif xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); diff --git a/arch/x86/kvm/pkvm/cpuid.c b/arch/x86/kvm/pkvm/cpuid.c index 9cb156ce1d5d..0420fbd75d81 100644 --- a/arch/x86/kvm/pkvm/cpuid.c +++ b/arch/x86/kvm/pkvm/cpuid.c @@ -146,17 +146,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, if (!xfeatures) return 0; -#ifdef __PKVM_HYP__ - /* - * TODO: The guest fpu xfd feature is enabled by the host when the host - * KVM run its kvm_check_cpuid function before calling the - * vcpu_after_set_cpuid PV interface. Revisit when implements the fpu - * isolation. - */ - return 0; -#else return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); -#endif } /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ diff --git a/arch/x86/kvm/pkvm/def.h b/arch/x86/kvm/pkvm/def.h index 874e911a6c35..e37dfce91adb 100644 --- a/arch/x86/kvm/pkvm/def.h +++ b/arch/x86/kvm/pkvm/def.h @@ -27,6 +27,8 @@ /* FIXME: Disable SGX to simplify POC */ #undef CONFIG_X86_SGX_KVM #undef CONFIG_PREEMPT_COUNT +#undef CONFIG_USE_X86_SEG_SUPPORT +#undef CONFIG_X86_DEBUG_FPU #define __NO_FORTIFY #include diff --git a/arch/x86/kvm/pkvm/fpu/core.c b/arch/x86/kvm/pkvm/fpu/core.c new file mode 100644 index 000000000000..ed367ee3e771 --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/core.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "legacy.h" +#include "xstate.h" +#include "context.h" + +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + +/* + * Represents the initial FPU state. It's mostly (but not completely) zeroes, + * depending on the FPU hardware format: + */ +struct fpstate init_fpstate __ro_after_init; + +/* + * Track which context is using the FPU on the CPU: + */ +DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Save the FPU register state in fpu->fpstate->regs. 
The register state is + * preserved. + * + * Must be called with fpregs_lock() held. + * + * The legacy FNSAVE instruction clears all FPU state unconditionally, so + * register state has to be reloaded. That might be a pointless exercise + * when the FPU is going to be used by another task right after that. But + * this only affects 20+ years old 32bit systems and avoids conditionals all + * over the place. + * + * FXSAVE and all XSAVE variants preserve the FPU register state. + */ +void save_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + os_xsave(fpu->fpstate); +#ifndef __PKVM_HYP__ + update_avx_timestamp(fpu); +#endif + return; + } + + if (likely(use_fxsr())) { + fxsave(&fpu->fpstate->regs.fxsave); + return; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to reload them from the memory state. + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); +} + +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) +{ + /* + * AMD K7/K8 and later CPUs up to Zen don't save/restore + * FDP/FIP/FOP unless an exception is pending. Clear the x87 state + * here by setting it to fixed values. "m" is a random variable + * that should be in L1. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %[addr]" /* set F?P to defined value */ + : : [addr] "m" (*fpstate)); + } + + if (use_xsave()) { + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. 
+ */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); + } else { + if (use_fxsr()) + fxrstor(&fpstate->regs.fxsave); + else + frstor(&fpstate->regs.fsave); + } +} + +static inline void fpstate_init_fxstate(struct fpstate *fpstate) +{ + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; +} + +/* + * Legacy x87 fpstate state init: + */ +static inline void fpstate_init_fstate(struct fpstate *fpstate) +{ + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; +} + +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) +{ + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { +#ifndef __PKVM_HYP__ + fpstate_init_soft(&fpstate->regs.soft); +#endif + return; + } + + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); + + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); + else + fpstate_init_fstate(fpstate); +} + +/* + * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ +#ifndef __PKVM_HYP__ + lockdep_assert_preemption_enabled(); +#endif + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be updated before + * any operation which depends on it can take place. + * + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read. + */ +void fpu_sync_guest_vmexit_xfd_state(void) +{ + struct fpstate *fps = current->thread.fpu.fpstate; + +#ifndef __PKVM_HYP__ + lockdep_assert_irqs_disabled(); +#endif + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } +} +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); +#ifdef __PKVM_HYP__ +#ifdef CONFIG_X86_64 + if (fpu_state_size_dynamic() && enter_guest) { + /* + * Refresh the xfd_state before guest vmenter so that the xfd can be + * restored after guest vmexit. 
+ */ + rdmsrl(MSR_IA32_XFD, cur_fps->xfd); + __this_cpu_write(xfd_state, cur_fps->xfd); + } +#endif + /* + * Only save the FPU registers when exiting the guest for a pVM. When + * entering the guest, the FPU registers hold the host's values, which + * the host has already saved itself. + */ + if (guest_fps->is_confidential && !enter_guest) +#else + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) +#endif + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + +#ifdef __PKVM_HYP__ + /* + * For a pVM, on guest entry restore the FPU with the data from the + * pVM's xsave area; on guest exit restore the initial data to wipe + * the pVM's FPU registers. + * + * For an npVM, no restore is needed. + */ + if (guest_fps->is_confidential) { +#else + if (!cur_fps->is_confidential) { +#endif + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = &current->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} diff --git a/arch/x86/kvm/pkvm/fpu/fpu.c b/arch/x86/kvm/pkvm/fpu/fpu.c new file mode 100644 index 000000000000..0c46cc764ade --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/fpu.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "internal.h" +#include "fpu.h" +#include "xstate.h" + +static DEFINE_PER_CPU(struct fpstate, percpu_fpstate); + +void pkvm_init_percpu_fpu(void) +{ + struct fpu *fpu = &current->thread.fpu; + + /* + * Set the current fpstate pointer to the percpu_fpstate, which is used + * to restore the FPU to the initial state before switching from a pVM + * to the host.
+ */ + fpu->fpstate = this_cpu_ptr(&percpu_fpstate); + fpstate_init_user(fpu->fpstate); + + /* The perm is initialized with the maximum features */ + fpu->perm.__state_perm = fpu_kernel_cfg.max_features; + fpu->perm.__state_size = fpu_kernel_cfg.max_size; + + fpu->guest_perm = fpu->perm; +} + +void pkvm_init_guest_fpu(struct fpu_guest *gfpu) +{ + u64 permitted = xstate_get_group_perm(true); + struct fpstate *fpstate = gfpu->fpstate; + + fpstate->xfeatures = fpu_kernel_cfg.default_features & permitted; + fpstate->user_xfeatures = fpu_user_cfg.default_features & permitted; + fpstate->xfd = 0; + + fpstate->in_use = false; + + fpstate_init_user(fpstate); + + gfpu->xfeatures = fpstate->user_xfeatures; +} diff --git a/arch/x86/kvm/pkvm/fpu/fpu.h b/arch/x86/kvm/pkvm/fpu/fpu.h new file mode 100644 index 000000000000..7e499dc9b893 --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/fpu.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PKVM_X86_FPU_H +#define __PKVM_X86_FPU_H + +void pkvm_setup_xstate_cache(void); +void pkvm_init_percpu_fpu(void); +void pkvm_init_guest_fpu(struct fpu_guest *gfpu); + +#endif /* __PKVM_X86_FPU_H */ diff --git a/arch/x86/kvm/pkvm/fpu/xstate.c b/arch/x86/kvm/pkvm/fpu/xstate.c new file mode 100644 index 000000000000..a5c6f5aa7842 --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/xstate.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include "internal.h" +#include "xstate.h" +#include "fpu.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) + +static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; + +#define XSTATE_FLAG_SUPERVISOR BIT(0) + +static bool xfeature_is_supervisor(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; +} + +static bool xfeature_enabled(enum xfeature xfeature) +{ + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); +} + +/* + * Record the offsets and sizes of various xstates contained + * in the XSAVE state memory layout. + */ +static void __init setup_xstate_cache(void) +{ + u32 eax, ebx, ecx, edx, i; + /* start at the beginning of the "extended state" */ + unsigned int last_good_offset = offsetof(struct xregs_state, + extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, + xmm_space); + + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + + xstate_sizes[i] = eax; + xstate_flags[i] = ecx; + + /* + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. + */ + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; + + /* + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. 
+ */ + WARN_ONCE(last_good_offset > xstate_offsets[i], + "x86/fpu: misordered xstate at %d\n", last_good_offset); + + last_good_offset = xstate_offsets[i]; + } +} + +#define XSTATE_FLAG_ALIGNED64 BIT(1) + +static bool xfeature_is_aligned64(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; +} + +static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) +{ + unsigned int offs, i; + + /* + * Non-compacted format and legacy features use the cached fixed + * offsets. + */ + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) + return xstate_offsets[xfeature]; + + /* + * Compacted format offsets depend on the actual content of the + * compacted xsave area which is determined by the xcomp_bv header + * field. + */ + offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for_each_extended_xfeature(i, xcomp_bv) { + if (xfeature_is_aligned64(i)) + offs = ALIGN(offs, 64); + if (i == xfeature) + break; + offs += xstate_sizes[i]; + } + return offs; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int topmost = fls64(xfeatures) - 1; + unsigned int offset = xstate_offsets[topmost]; + + if (topmost <= XFEATURE_SSE) + return sizeof(struct xregs_state); + + if (compacted) + offset = xfeature_get_offset(xfeatures, topmost); + return offset + xstate_sizes[topmost]; +} + +/* + * Given an xstate feature nr, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. + */ +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + u64 xcomp_bv = xsave->header.xcomp_bv; + + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { + if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) + return NULL; + } + + return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * + * Inputs: + * xstate: the thread's storage area for all FPU data + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) + * Output: + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + /* + * We should not ever be requesting features that we + * have not enabled. + */ + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xfeature_nr' be saved. + * If it did not, we might be seeing and old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. 
+ */ + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) + return NULL; + + return __raw_xsave_addr(xsave, xfeature_nr); +} +EXPORT_SYMBOL_GPL(get_xsave_addr); + +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + +#ifdef CONFIG_X86_64 +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; +#ifdef __PKVM_HYP__ + struct fpstate *fps; + unsigned int ksize; + + if (!xfd_event) + return 0; + + if (!guest_fpu) + return -EINVAL; + + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) + return -EPERM; + + fps = guest_fpu->fpstate; + ksize = xstate_calculate_size(fps->xfeatures | xfd_event, + cpu_feature_enabled(X86_FEATURE_XCOMPACTED)); + if (fps->size < ksize) + /* State size is insufficient. */ + return -ENOMEM; + + guest_fpu->xfeatures |= xfd_event; + fps->xfeatures |= xfd_event; + fps->user_xfeatures |= xfd_event; + fps->xfd &= ~xfd_event; + + xstate_init_xcomp_bv(&fps->regs.xsave, fps->xfeatures); + if (fps->in_use) + xfd_update_state(fps); + + return 0; +#else + struct fpu_state_perm *perm; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) + return -EFAULT; + return 0; +#endif +} +#endif + +#ifdef __PKVM_HYP__ +void pkvm_setup_xstate_cache(void) +{ + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? 
"FXSAVE" : "FSAVE"); + return; + } + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN_ON_FPU(1); + return; + } + + setup_xstate_cache(); +} +#endif diff --git a/arch/x86/kvm/pkvm/pkvm.c b/arch/x86/kvm/pkvm/pkvm.c index 69a9db4a9a9f..3763234ecc98 100644 --- a/arch/x86/kvm/pkvm/pkvm.c +++ b/arch/x86/kvm/pkvm/pkvm.c @@ -4,6 +4,7 @@ #include "x86.h" #include "pkvm.h" #include "cpuid.h" +#include "fpu/fpu.h" #include //FIXME: clean up the header files #include @@ -187,7 +188,8 @@ static int pkvm_vm_init(struct kvm *shared_kvm, unsigned long gpa) return ret; } -static int attach_pkvm_vcpu_to_vm(struct pkvm_vcpu *pkvm_vcpu, struct pkvm_vm *pkvm_vm) +static int attach_pkvm_vcpu_to_vm(struct pkvm_vcpu *pkvm_vcpu, struct fpstate *fps, + struct pkvm_vm *pkvm_vm) { struct kvm_vcpu *vcpu; struct kvm *kvm; @@ -218,6 +220,7 @@ static int attach_pkvm_vcpu_to_vm(struct pkvm_vcpu *pkvm_vcpu, struct pkvm_vm *p */ vcpu->arch.apic = pkvm_vcpu->shared_vcpu->arch.apic; vcpu->arch.apic_base = pkvm_vcpu->shared_vcpu->arch.apic_base; + vcpu->arch.guest_fpu.fpstate = fps; ret = kvm_arch_vcpu_create(vcpu); if (ret) @@ -269,27 +272,33 @@ void put_pkvm_vm(struct pkvm_vm *pkvm_vm) WARN_ON(atomic_dec_if_positive(&pkvm_vm_ref->refcount) <= 0); } -static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa) +static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa, + unsigned long size) { struct pkvm_vcpu *pkvm_vcpu; - unsigned long pkvm_vcpu_pa; struct pkvm_vm *pkvm_vm; struct kvm *shared_kvm; - size_t pa_size; + struct fpstate *fps; + unsigned long pa; + size_t fpsize; + void *va; int ret; - pkvm_vcpu_pa = host_gpa2hpa(gpa); - if (!PAGE_ALIGNED(pkvm_vcpu_pa)) - return -EINVAL; + if (!VALID_PAGE(gpa) || + !PAGE_ALIGNED(gpa) || + !PAGE_ALIGNED(size) || + (size <= PAGE_ALIGN(pkvm_vcpu_sz))) + return -ENOMEM; - pa_size = PAGE_ALIGN(pkvm_vcpu_sz); - if (__pkvm_host_donate_hyp(pkvm_vcpu_pa, pa_size)) + pa = host_gpa2hpa(gpa); + if (__pkvm_host_donate_hyp(pa, size)) return -EINVAL; - pkvm_vcpu = pkvm_phys_to_virt(pkvm_vcpu_pa); - memset(pkvm_vcpu, 0, pa_size); + va = __pkvm_va(pa); + memset(va, 0, size); - pkvm_vcpu->size = pa_size; + pkvm_vcpu = va; + pkvm_vcpu->size = PAGE_ALIGN(pkvm_vcpu_sz); /* * TODO: Assume host is already share the kvm_vcpu structure * (represented by shared_vcpu) with pkvm. So just pin @@ -304,7 +313,42 @@ static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa) goto undonate; } - ret = attach_pkvm_vcpu_to_vm(pkvm_vcpu, pkvm_vm); + fpsize = size - pkvm_vcpu->size; + if (pkvm_is_protected_vm(to_kvm(pkvm_vm))) { + /* + * The pkvm hypervisor switches the FPU registers for the pVM + * thus the fpstate size should satisfy the fpu kernel config + * default size. + */ + if (fpsize < (fpu_kernel_cfg.default_size + + ALIGN(offsetof(struct fpstate, regs), 64))) { + ret = -EINVAL; + goto put_pkvm_vm; + } + } else { + /* + * The host switches the FPU registers for the npVM thus the + * fpstate in the pkvm hypervisor is not used except for the XFD + * MSR emulation. So the fpstate size should just satisfy the + * struct fpstate size except for the regs. 
+ */ + if (fpsize < ALIGN(offsetof(struct fpstate, regs), 64)) { + ret = -EINVAL; + goto put_pkvm_vm; + } + } + + fps = (struct fpstate *)(va + pkvm_vcpu->size); + memset(fps, 0, fpsize); + /* + * Although the fpstate size represents the size of the register memory, + * use this field to save the size of the fpstate memory to simplify the + * undonating, which is the only usage of the fpstate size field in the + * pkvm hypervisor. + */ + fps->size = fpsize; + + ret = attach_pkvm_vcpu_to_vm(pkvm_vcpu, fps, pkvm_vm); if (ret) goto put_pkvm_vm; @@ -315,7 +359,7 @@ static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa) put_pkvm_vm: put_pkvm_vm(pkvm_vm); undonate: - __pkvm_hyp_donate_host(pkvm_vcpu_pa, pa_size); + __pkvm_hyp_donate_host(pa, size); return ret; } @@ -339,6 +383,9 @@ static void pkvm_vm_destroy(int handle) (void *)vcpu->arch.cpuid_entries, sizeof(struct kvm_cpuid_entry2) * vcpu->arch.cpuid_nent); + teardown_donated_memory(&shared_pkvm->teardown_mc, + (void *)vcpu->arch.guest_fpu.fpstate, + vcpu->arch.guest_fpu.fpstate->size); teardown_donated_memory(&shared_pkvm->teardown_mc, (void *)pkvm_vcpu, pkvm_vcpu->size); /* TODO: unpin shared kvm_vcpu */ @@ -451,6 +498,7 @@ static bool is_kvm_vcpu_accessible(struct kvm_vcpu *vcpu, unsigned long fn) case __pkvm__post_set_cr3: case __pkvm__cache_reg: case __pkvm__update_exception_bitmap: + case __pkvm__vcpu_add_fpstate: /* * FIXME: As the host still needs to pre-configure pVM's vcpu * state for booting, the protection is enforced by the pkvm @@ -539,6 +587,9 @@ static void pkvm_vcpu_load(struct pkvm_vcpu *pkvm_vcpu, int cpu) if (WARN_ON_ONCE(vcpu->cpu != -1 && vcpu->cpu != cpu)) return; + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); + kvm_x86_call(vcpu_load)(vcpu, cpu); set_pkvm_vcpu_inuse(pkvm_vcpu); @@ -703,44 +754,64 @@ static unsigned long pkvm_vcpu_run(struct pkvm_vcpu *pkvm_vcpu, bool force_immed return reqs; } -static unsigned long pkvm_vcpu_after_set_cpuid(struct pkvm_vcpu *pkvm_vcpu, unsigned long new_pa) +static unsigned long pkvm_vcpu_after_set_cpuid(struct pkvm_vcpu *pkvm_vcpu, + unsigned long new_entries_gpa, + unsigned long e2size) { + struct pkvm_memcache mc = { + .head = INVALID_PAGE, + .nr_pages = 0, + }; struct kvm_cpuid_entry2 *new, *old; - unsigned long ret = new_pa; + int new_nent, old_nent; struct kvm_vcpu *vcpu; - int nent; - u64 size; + unsigned long e2pa; - if (WARN_ON_ONCE(!pkvm_vcpu)) - return ret; + if (!VALID_PAGE(new_entries_gpa) || + !PAGE_ALIGNED(new_entries_gpa) || + !PAGE_ALIGNED(e2size) || + !e2size) + return INVALID_PAGE; - nent = pkvm_vcpu->shared_vcpu->arch.cpuid_nent; - size = PAGE_ALIGN(sizeof(struct kvm_cpuid_entry2) * nent); - if (__pkvm_host_donate_hyp(new_pa, size)) - return ret; + e2pa = host_gpa2hpa(new_entries_gpa); + if (WARN_ON_ONCE(!pkvm_vcpu) || + __pkvm_host_donate_hyp(e2pa, e2size)) + goto out; vcpu = to_kvm_vcpu(pkvm_vcpu); + old_nent = vcpu->arch.cpuid_nent; old = vcpu->arch.cpuid_entries; - new = __pkvm_va(new_pa); + new_nent = e2size / sizeof(struct kvm_cpuid_entry2); + new = __pkvm_va(e2pa); - if (kvm_set_cpuid(vcpu, new, nent) || vcpu->arch.cpuid_entries != new) { - /* New physical page is not consumed */ - __pkvm_hyp_donate_host(new_pa, size); - } else if (vcpu->arch.cpuid_entries == new) { - /* New physical page is consumed */ + if (!kvm_set_cpuid(vcpu, new, new_nent) && (vcpu->arch.cpuid_entries == new)) { + /* + * New physical page is consumed. Teardown the old cpuid + * entry memory pages if there is. 
+ */ + if (old) { - memset(old, 0, size); - /* Let the host VMM to free the old physical pages */ - ret = __pkvm_pa(old); - /* Before that, undonate the old physical pages */ - __pkvm_hyp_donate_host(ret, size); + e2pa = __pkvm_pa(old); + e2size = sizeof(struct kvm_cpuid_entry2) * old_nent; } else { - /* No physical page for the host VMM to free */ - ret = INVALID_PAGE; + e2pa = INVALID_PAGE; + e2size = 0; } } +out: + if (VALID_PAGE(e2pa)) + teardown_donated_memory(&mc, (void *)__pkvm_va(e2pa), e2size); - return ret; + if (VALID_PAGE(mc.head)) { + /* + * Store nr_pages in the first page of the torn-down memory, at the + * offset just past the sizeof(phys_addr_t) slot that holds the next + * page's physical address. + */ + unsigned long *nr_pages = __pkvm_va(mc.head) + sizeof(phys_addr_t); + + *nr_pages = mc.nr_pages; + } + return mc.head; } static void pkvm_reset_vcpu(struct pkvm_vcpu *pkvm_vcpu, bool init_event) @@ -1304,6 +1375,74 @@ static void pkvm_update_exception_bitmap(struct pkvm_vcpu *pkvm_vcpu) kvm_x86_call(update_exception_bitmap)(vcpu); } +static unsigned long pkvm_vcpu_add_fpstate(struct pkvm_vcpu *pkvm_vcpu, + unsigned long new_fps_gpa, + unsigned long fpsize) +{ + struct pkvm_memcache mc = { + .head = INVALID_PAGE, + .nr_pages = 0, + }; + struct fpstate *old_fps; + struct kvm_vcpu *vcpu; + unsigned long fpspa; + + if (!VALID_PAGE(new_fps_gpa) || + !PAGE_ALIGNED(new_fps_gpa) || + !PAGE_ALIGNED(fpsize) || + !fpsize) + return INVALID_PAGE; + + fpspa = host_gpa2hpa(new_fps_gpa); + if (WARN_ON_ONCE(!pkvm_vcpu)) + goto out; + + vcpu = to_kvm_vcpu(pkvm_vcpu); + old_fps = vcpu->arch.guest_fpu.fpstate; + /* + * The npVM's FPU state is managed by the host, so there is no need to + * swap the fpstate in the pkvm hypervisor. The fpstate size should be + * checked for the pVM. See comments in pkvm_vcpu_create. + */ + if (!pkvm_is_protected_vcpu(vcpu) || + (fpsize < (fpu_kernel_cfg.default_size + + ALIGN(offsetof(struct fpstate, regs), 64)))) + goto out; + + if (__pkvm_host_donate_hyp(fpspa, fpsize)) + goto out; + + vcpu->arch.guest_fpu.fpstate = __pkvm_va(fpspa); + memset(vcpu->arch.guest_fpu.fpstate, 0, fpsize); + /* Save the fpsize in fpstate->size. See comments in pkvm_vcpu_create */ + vcpu->arch.guest_fpu.fpstate->size = fpsize; + pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); + + if (old_fps) { + fpspa = __pkvm_pa(old_fps); + fpsize = old_fps->size; + } else { + fpspa = INVALID_PAGE; + fpsize = 0; + } +out: + if (VALID_PAGE(fpspa)) + teardown_donated_memory(&mc, (void *)__pkvm_va(fpspa), fpsize); + + if (VALID_PAGE(mc.head)) { + /* + * Store nr_pages in the first page of the torn-down memory, at the + * offset just past the sizeof(phys_addr_t) slot that holds the next + * page's physical address.
+ */ + unsigned long *nr_pages = __pkvm_va(mc.head) + sizeof(phys_addr_t); + + *nr_pages = mc.nr_pages; + } + + return mc.head; +} + static unsigned long pkvm_vcpu_handle_kvm_call(unsigned long fn, struct kvm_vcpu *shared_vcpu, unsigned long p2, unsigned long p3) @@ -1324,7 +1463,7 @@ static unsigned long pkvm_vcpu_handle_kvm_call(unsigned long fn, ret = pkvm_vcpu_run(pkvm_vcpu, (bool)p2); break; case __pkvm__vcpu_after_set_cpuid: - ret = pkvm_vcpu_after_set_cpuid(pkvm_vcpu, p2); + ret = pkvm_vcpu_after_set_cpuid(pkvm_vcpu, p2, p3); break; case __pkvm__vcpu_reset: pkvm_reset_vcpu(pkvm_vcpu, (bool)p2); @@ -1473,6 +1612,9 @@ static unsigned long pkvm_vcpu_handle_kvm_call(unsigned long fn, case __pkvm__update_exception_bitmap: pkvm_update_exception_bitmap(pkvm_vcpu); break; + case __pkvm__vcpu_add_fpstate: + ret = pkvm_vcpu_add_fpstate(pkvm_vcpu, p2, p3); + break; default: ret = -EINVAL; break; @@ -1506,7 +1648,7 @@ unsigned long handle_kvm_call(unsigned long fn, unsigned long p1, ret = 0; break; case __pkvm__vcpu_create: - ret = pkvm_vcpu_create((struct kvm_vcpu *)kern_pkvm_va((void *)p1), p2); + ret = pkvm_vcpu_create((struct kvm_vcpu *)kern_pkvm_va((void *)p1), p2, p3); break; default: ret = pkvm_vcpu_handle_kvm_call(fn, (struct kvm_vcpu *)kern_pkvm_va((void *)p1), diff --git a/arch/x86/kvm/pkvm/smp.c b/arch/x86/kvm/pkvm/smp.c index 2d4e92e4743f..f1a3b09e449e 100644 --- a/arch/x86/kvm/pkvm/smp.c +++ b/arch/x86/kvm/pkvm/smp.c @@ -9,6 +9,7 @@ unsigned long __per_cpu_offset[NR_CPUS]; DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); +DEFINE_PER_CPU(struct task_struct, cur_task); struct cpumask __cpu_possible_mask __ro_after_init; unsigned int nr_cpu_ids; @@ -22,6 +23,7 @@ unsigned int pkvm_per_cpu_nr_pages(void) int setup_pkvm_per_cpu(int cpu, unsigned long base) { + struct task_struct *task; unsigned long elf_base; if (cpu >= ARRAY_SIZE(__per_cpu_offset)) @@ -32,5 +34,9 @@ int setup_pkvm_per_cpu(int cpu, unsigned long base) per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(pcpu_hot.cpu_number, cpu) = cpu; + task = per_cpu_ptr(&cur_task, cpu); + task->group_leader = task; + per_cpu(pcpu_hot.current_task, cpu) = task; + return 0; } diff --git a/arch/x86/kvm/pkvm/vmx/vmx.c b/arch/x86/kvm/pkvm/vmx/vmx.c index 08b3252403ed..a2ea902deb4e 100644 --- a/arch/x86/kvm/pkvm/vmx/vmx.c +++ b/arch/x86/kvm/pkvm/vmx/vmx.c @@ -4482,7 +4482,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) */ if (is_nm_fault(intr_info)) { kvm_queue_exception(vcpu, NM_VECTOR); - return 0; + return 1; } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -6405,6 +6405,63 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. 
+ */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + +static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +{ +#ifdef __PKVM_HYP__ + /* if exit due to NM, handle before interrupts are enabled */ + if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(vcpu); +#else + /* if exit due to PF check for async PF */ + if (is_page_fault(intr_info)) + vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + /* if exit due to NM, handle before interrupts are enabled */ + else if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(vcpu); + /* Handle machine checks before interrupts are enabled */ + else if (is_machine_check(intr_info)) + kvm_machine_check(); +#endif +} + +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->emulation_required) + return; + +#ifdef __PKVM_HYP__ + if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); +#else + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); +#endif +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7783,28 +7840,6 @@ static void vmx_sync_vcpu_state_post_switch(struct pkvm_vcpu *pkvm_vcpu) if (pkvm_is_protected_vcpu(vcpu) && pkvm_has_req_to_host(HOST_HANDLE_EXIT, vcpu)) update_protected_vcpu_state(vcpu, shared_vcpu); - - /* - * FIXME: The MSR_IA32_XFD handling in vmx_set_msr is skipped as - * currently the FPU switching is still managed by the host. So the - * MSR_IA32_XFD emulation is forwarded to the host to handle. On behalf - * of the host, updating the MSR interception and exeption bitmap before - * entering the guest according to the xfd_no_write_intercept flag. This - * should be removed once the XFD emulation can be done in the pkvm - * hypervisor. 
- */ - if (unlikely(shared_vcpu->arch.xfd_no_write_intercept ^ - vcpu->arch.xfd_no_write_intercept)) { - vcpu->arch.xfd_no_write_intercept = - shared_vcpu->arch.xfd_no_write_intercept; - if (shared_vcpu->arch.xfd_no_write_intercept) - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, - MSR_TYPE_RW); - else - vmx_enable_intercept_for_msr(vcpu, MSR_IA32_XFD, - MSR_TYPE_RW); - vmx_update_exception_bitmap(vcpu); - } } static void share_protected_vcpu_state(struct kvm_vcpu *vcpu, @@ -8024,6 +8059,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .load_mmu_pgd = vmx_load_mmu_pgd, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + .setup_mce = vmx_setup_mce, }; diff --git a/arch/x86/kvm/pkvm/x86.c b/arch/x86/kvm/pkvm/x86.c index e19af92d95da..a39dee657ec2 100644 --- a/arch/x86/kvm/pkvm/x86.c +++ b/arch/x86/kvm/pkvm/x86.c @@ -12,6 +12,9 @@ #include #include "pkvm.h" #include +#include "fpu/fpu.h" + +#include #ifdef __PKVM_HYP__ #undef module_param_named @@ -747,6 +750,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); +#ifdef CONFIG_X86_64 +static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; +} +#endif + static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; @@ -1767,6 +1777,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#endif #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && @@ -1788,7 +1799,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; -#endif #endif default: #ifndef __PKVM_HYP__ /* FIXME: Leave to the host to emulate */ @@ -2086,6 +2096,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; +#endif #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && @@ -2101,7 +2112,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; -#endif #endif default: #ifndef __PKVM_HYP__ /* FIXME: Leave to the host to emulate */ @@ -2708,6 +2718,22 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +/* Swap (qemu) user FPU context for the guest FPU context. */ +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); + trace_kvm_fpu(1); +} + +/* When vcpu_run ends, restore user space FPU context. 
*/ +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { #ifdef __PKVM_HYP__ @@ -2717,6 +2743,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); + + if (pkvm_is_protected_vcpu(vcpu)) + fpstate_set_confidential(&vcpu->arch.guest_fpu); + return kvm_x86_call(vcpu_create)(vcpu); #else struct page *page; @@ -2873,14 +2904,28 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; +#ifdef __PKVM_HYP__ /* - * FIXME: As the guest fpu is still managed by the host and the pkvm - * hypervisor doesn't have valid fpstate in its vcpu->arch.guest_fpu, - * reset the fpstate for MPX is done by the host when runs its - * kvm_vcpu_reset. To add fpu isolation, revisit to see how to do this - * in the pkvm hypervisor. + * The pkvm hypervisor does the FPU switching for the pVM and the host + * does the FPU switching for the npVM, which means that the pkvm + * hypervisor only needs to take care the fpstate of the pVM. So only + * needs to clearing the BNDREGS/BNDCSR for the pVM. */ -#ifndef __PKVM_HYP__ + if (pkvm_is_protected_vcpu(vcpu) && + vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + bool in_use = fpstate->in_use; + + if (in_use) + kvm_put_guest_fpu(vcpu); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + + if (in_use) + kvm_load_guest_fpu(vcpu); + } +#else if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; @@ -3291,105 +3336,133 @@ static void kvm_restore_user_return_msr(void) } } -unsigned long kvm_vcpu_enter_guest(struct kvm_vcpu *vcpu, bool force_immediate_exit) +static int __kvm_vcpu_enter_guest(struct kvm_vcpu *vcpu, bool force_immediate_exit) { - struct kvm_vcpu *hvcpu = this_cpu_read(host_vcpu); + bool req_immediate_exit = false; fastpath_t exit_fastpath; - int ret, i; + int ret; - pkvm_reset_reqs_to_host(vcpu); + if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvm_vcpu_flush_tlb_all(vcpu); - if (kvm_x86_call(vcpu_pre_run)(vcpu) <= 0) - return 0; + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); - vcpu->arch.last_vmentry_cpu = vcpu->cpu; + if (kvm_check_request(KVM_REQ_EVENT, vcpu)) + kvm_check_and_inject_events(vcpu, &req_immediate_exit); + } - /* TODO: Save the host VMM fpu and load the guest fpu */ + kvm_x86_call(prepare_switch_to_guest)(vcpu); - /* Save the host debug registers */ - get_debugreg(hvcpu->arch.dr7, 7); - for (i = 0; i < KVM_NR_DB_REGS; i++) - get_debugreg(hvcpu->arch.db[i], i); + /* + * Make sure vcpu->mode is changed to IN_GUEST_MODE before + * running to mark this vcpu should be kicked for any new + * vcpu request. 
+ */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); - vcpu->arch.host_debugctl = get_debugctlmsr(); + if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); + else + req_immediate_exit = force_immediate_exit; + + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); + } for (;;) { - bool req_immediate_exit = false; + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, req_immediate_exit); - if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb_all(vcpu); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + } - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + /* Sync the guest debug registers */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_call(sync_dirty_debug_regs)(vcpu); + kvm_update_dr0123(vcpu); + kvm_update_dr7(vcpu); + } - if (kvm_check_request(KVM_REQ_EVENT, vcpu)) - kvm_check_and_inject_events(vcpu, &req_immediate_exit); - } + /* + * Make sure vcpu->mode is changed to OUTSIDE_GUEST_MODE after + * vmexit to mark this vcpu no need to be kicked for any new + * vcpu request. + */ + smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); - kvm_x86_call(prepare_switch_to_guest)(vcpu); + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); - /* - * Make sure vcpu->mode is changed to IN_GUEST_MODE before - * running to mark this vcpu should be kicked for any new - * vcpu request. - */ - smp_store_mb(vcpu->mode, IN_GUEST_MODE); - - if (req_immediate_exit) - kvm_make_request(KVM_REQ_EVENT, vcpu); - else - req_immediate_exit = force_immediate_exit; - - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.eff_db[0], 0); - set_debugreg(vcpu->arch.eff_db[1], 1); - set_debugreg(vcpu->arch.eff_db[2], 2); - set_debugreg(vcpu->arch.eff_db[3], 3); - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); - } + kvm_x86_call(handle_exit_irqoff)(vcpu); - exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, req_immediate_exit); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); - /* Sync the guest debug registers */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - kvm_x86_call(sync_dirty_debug_regs)(vcpu); - kvm_update_dr0123(vcpu); - kvm_update_dr7(vcpu); - } + ret = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); + if (ret <= 0) { + pkvm_make_req_to_host(HOST_HANDLE_EXIT, vcpu); + goto out; + } - /* - * Make sure vcpu->mode is changed to OUTSIDE_GUEST_MODE after - * vmexit to mark this vcpu no need to be kicked for any new - * vcpu request. 
- */ - smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); + if (unlikely(force_immediate_exit) || pkvm_reqs_to_host(vcpu)) + goto out; - if (unlikely(exit_fastpath == EXIT_FASTPATH_REENTER_GUEST)) - continue; + return 1; +out: + kvm_x86_call(prepare_switch_to_host)(vcpu); + return 0; +} - ret = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); - if (ret <= 0) { - pkvm_make_req_to_host(HOST_HANDLE_EXIT, vcpu); - break; - } +unsigned long kvm_vcpu_enter_guest(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + struct kvm_vcpu *hvcpu = this_cpu_read(host_vcpu); + int i; + + pkvm_reset_reqs_to_host(vcpu); + + if (kvm_x86_call(vcpu_pre_run)(vcpu) <= 0) + return 0; + + vcpu->arch.last_vmentry_cpu = vcpu->cpu; + + kvm_load_guest_fpu(vcpu); + + /* Save the host debug registers */ + get_debugreg(hvcpu->arch.dr7, 7); + for (i = 0; i < KVM_NR_DB_REGS; i++) + get_debugreg(hvcpu->arch.db[i], i); + + vcpu->arch.host_debugctl = get_debugctlmsr(); - if (unlikely(force_immediate_exit) || pkvm_reqs_to_host(vcpu)) + for (;;) { + if (__kvm_vcpu_enter_guest(vcpu, force_immediate_exit) != 1) break; } - kvm_x86_call(prepare_switch_to_host)(vcpu); - /* Restore the host debug registers */ set_debugreg(hvcpu->arch.dr7, 7); for (i = 0; i < KVM_NR_DB_REGS; i++) set_debugreg(hvcpu->arch.db[i], i); - /* TODO: Restore the host VMM fpu and save the guest fpu */ + kvm_put_guest_fpu(vcpu); kvm_restore_user_return_msr(); diff --git a/arch/x86/kvm/vmx/pkvm/hyp/Makefile b/arch/x86/kvm/vmx/pkvm/hyp/Makefile index 0fc5db3f85bc..605489adb21b 100644 --- a/arch/x86/kvm/vmx/pkvm/hyp/Makefile +++ b/arch/x86/kvm/vmx/pkvm/hyp/Makefile @@ -16,7 +16,7 @@ pkvm-hyp-y := vmx_asm.o vmexit.o memory.o page_alloc.o early_alloc.o pgtable.o m pkvm := ../../../pkvm pkvm-hyp-y += $(pkvm)/smp.o $(pkvm)/pkvm.o $(pkvm)/x86.o $(pkvm)/vmx/vmx.o $(pkvm)/vmx/vmenter.o \ - $(pkvm)/cpuid.o $(pkvm)/lapic.o + $(pkvm)/cpuid.o $(pkvm)/lapic.o $(pkvm)/fpu/core.o $(pkvm)/fpu/xstate.o $(pkvm)/fpu/fpu.o lib-dir := lib lib2-dir := ../../../../../../lib @@ -31,6 +31,9 @@ pkvm-hyp-$(CONFIG_LIST_HARDENED) += $(lib-dir)/list_debug.o pkvm-obj := $(patsubst %.o,%.pkvm.o,$(pkvm-hyp-y)) obj-$(CONFIG_PKVM_INTEL) += pkvm.o +CFLAGS_$(pkvm)/fpu/xstate.pkvm.o += -iquote $(srctree)/arch/x86/kernel/fpu +CFLAGS_$(pkvm)/fpu/fpu.pkvm.o += -iquote $(srctree)/arch/x86/kernel/fpu +CFLAGS_$(pkvm)/fpu/core.pkvm.o += -iquote $(srctree)/arch/x86/kernel/fpu AFLAGS_$(pkvm)/vmx/vmenter.pkvm.o += -iquote $(obj)/$(pkvm) $(obj)/$(pkvm)/vmx/vmenter.pkvm.o: $(obj)/$(pkvm)/kvm-asm-offsets.h diff --git a/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c index cacb57f79dbf..754db2e25a82 100644 --- a/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c +++ b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c @@ -29,6 +29,7 @@ #include #include #include +#include bool pvmfw_present; phys_addr_t pvmfw_base; @@ -382,6 +383,8 @@ int __pkvm_init_finalise(struct kvm_vcpu *vcpu, struct pkvm_section sections[], if (ret) goto out; + pkvm_setup_xstate_cache(); + pkvm_init = true; switch_pgt: @@ -413,6 +416,8 @@ int __pkvm_init_finalise(struct kvm_vcpu *vcpu, struct pkvm_section sections[], ept_sync_global(); + pkvm_init_percpu_fpu(); + ret = pkvm_setup_lapic(pcpu, vcpu->cpu); out: return ret; diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c index 1b96d3d68757..8ee0361350bc 100644 --- a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c +++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "trace.h" #include "vmexit.h" diff 
--git a/arch/x86/kvm/vmx/pkvm/include/pkvm.h b/arch/x86/kvm/vmx/pkvm/include/pkvm.h index a4988df910f4..beed14cef67b 100644 --- a/arch/x86/kvm/vmx/pkvm/include/pkvm.h +++ b/arch/x86/kvm/vmx/pkvm/include/pkvm.h @@ -146,6 +146,12 @@ extern bool pkvm_sym(pvmfw_present); extern phys_addr_t pkvm_sym(pvmfw_base); extern phys_addr_t pkvm_sym(pvmfw_size); +extern struct fpu_state_config pkvm_sym(fpu_kernel_cfg); +extern struct fpu_state_config pkvm_sym(fpu_user_cfg); +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(pkvm_sym(__fpu_state_size_dynamic)); +#endif + PKVM_DECLARE(void, __pkvm_vmexit_entry, (void)); PKVM_DECLARE(bool, pkvm_vmexit_main, (struct kvm_vcpu *vcpu)); PKVM_DECLARE(void, pkvm_init_host_state_area, (struct pkvm_pcpu *pcpu, int cpu)); diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_host.c b/arch/x86/kvm/vmx/pkvm/pkvm_host.c index 50e9ce548916..ced8b53d372e 100644 --- a/arch/x86/kvm/vmx/pkvm/pkvm_host.c +++ b/arch/x86/kvm/vmx/pkvm/pkvm_host.c @@ -1316,6 +1316,12 @@ static void __init setup_pkvm_syms(void) cpumask_copy(&pkvm_sym(__cpu_possible_mask), cpu_possible_mask); pkvm_sym(nr_cpu_ids) = nr_cpu_ids; pkvm_sym(x86_pred_cmd) = x86_pred_cmd; + pkvm_sym(fpu_kernel_cfg) = fpu_kernel_cfg; + pkvm_sym(fpu_user_cfg) = fpu_user_cfg; +#ifdef CONFIG_X86_64 + if (fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC) + static_branch_enable(&pkvm_sym(__fpu_state_size_dynamic)); +#endif } int __init vmx_pkvm_init(void) diff --git a/arch/x86/kvm/vmx/pkvm_high.c b/arch/x86/kvm/vmx/pkvm_high.c index 1505f0b5272e..827c8ed21475 100644 --- a/arch/x86/kvm/vmx/pkvm_high.c +++ b/arch/x86/kvm/vmx/pkvm_high.c @@ -177,6 +177,8 @@ static bool pkvm_hyp_emulated_msr(u32 msr) case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: #endif case MSR_IA32_SYSENTER_CS: case MSR_IA32_SYSENTER_EIP: @@ -900,13 +902,29 @@ static int pkvm_vcpu_create(struct kvm_vcpu *vcpu) } pkvm_vcpu_sz = PAGE_ALIGN(PKVM_SHADOW_VCPU_STATE_SIZE); + if (pkvm_is_protected_vcpu(vcpu)) + /* + * The pVM FPU registers will be switched by the pkvm + * hypervisor. Allocate the fpstate regs memory according + * to the real guest_fpu.fpstate size. + */ + pkvm_vcpu_sz += PAGE_ALIGN(vcpu->arch.guest_fpu.fpstate->size + + ALIGN(offsetof(struct fpstate, regs), 64)); + else + /* + * The npVM FPU registers will be switched by the host. No need + * to count the real guest_fpu.fpstate size but just strcut + * fpstate size except for the regs. + */ + pkvm_vcpu_sz += PAGE_ALIGN(ALIGN(offsetof(struct fpstate, regs), 64)); + pkvm_vcpu = alloc_pages_exact(pkvm_vcpu_sz, GFP_KERNEL_ACCOUNT); if (!pkvm_vcpu) goto free_ve; /* TODO: share struct vcpu_vmx with pkvm */ - ret = kvm_call_pkvm(vcpu_create, vcpu, __pa(pkvm_vcpu)); + ret = kvm_call_pkvm(vcpu_create, vcpu, __pa(pkvm_vcpu), pkvm_vcpu_sz); if (ret < 0) goto free_pages; @@ -1046,8 +1064,6 @@ static int pkvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int pkvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - int ret; - /* Use PV interface to set the MSR emulated by the pkvm hypervisor */ if (pkvm_hyp_emulated_msr(msr_info->index)) { if (!vcpu->arch.guest_state_protected) { @@ -1065,20 +1081,7 @@ static int pkvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } /* Otherwise handle by the host VMM itself */ - ret = kvm_set_msr_common(vcpu, msr_info); - if (ret) - return ret; - - /* - * FIXME: The pkvm hypervisor will disable the write intercept for the - * XFD MSR. 
But as the FPU switching is done by the host, has to set the - * xfd_no_write_intercept here. Once the FPU switching can be done in - * the pkvm hypervisor, this can be removed. - */ - if (msr_info->index == MSR_IA32_XFD && msr_info->data) - vcpu->arch.xfd_no_write_intercept = true; - - return 0; + return kvm_set_msr_common(vcpu, msr_info); } static u64 pkvm_get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -1433,8 +1436,15 @@ static fastpath_t pkvm_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit * can be enabled earlier. */ if (unlikely(vcpu->kvm->arch.has_protected_state && - !vcpu->arch.guest_state_protected)) + !vcpu->arch.guest_state_protected)) { vcpu->arch.guest_state_protected = true; + /* + * Mark the guest_fpu as confidential so that the host VMM does not do + * the FPU switching for the pVM, as this is done by the pkvm + * hypervisor. + */ + fpstate_set_confidential(&vcpu->arch.guest_fpu); + } if (unlikely(vmx->exit_reason.full == 0xdead)) { vmx->fail = 1; @@ -1756,17 +1766,56 @@ static void pkvm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, } } +static int pkvm_vcpu_realloc_fpstate(struct kvm_vcpu *vcpu) +{ + unsigned long old_fpspa; + size_t fpsize; + void *fps; + + fpsize = PAGE_ALIGN(vcpu->arch.guest_fpu.fpstate->size + + ALIGN(offsetof(struct fpstate, regs), 64)); + fps = alloc_pages_exact(fpsize, GFP_KERNEL_ACCOUNT); + if (!fps) + return -ENOMEM; + + old_fpspa = kvm_call_pkvm(vcpu_add_fpstate, vcpu, __pa(fps), fpsize); + if (VALID_PAGE(old_fpspa)) { + unsigned long *nr_pages = __va(old_fpspa) + sizeof(phys_addr_t); + struct pkvm_memcache mc = { + .head = old_fpspa, + .nr_pages = *nr_pages, + }; + + free_pkvm_memcache(&mc); + } + + return 0; +} + static void pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *e2 = vcpu->arch.cpuid_entries; int nent = vcpu->arch.cpuid_nent; - unsigned long unused_pa; + unsigned long old_entries_pa; void *entries; size_t size; if (vcpu->arch.guest_state_protected || !e2 || !nent) return; + /* + * With the FPU dynamic features exposed via the cpuid, the fpstate + * allocated when creating the vcpu may not be sufficient for the + * guest. As the pVM's FPU state is managed by the pkvm hypervisor + * while the npVM's FPU state is managed by the host, re-allocating the + * fpstate is only necessary for the pVM, and should be done before + * adding the new cpuid entries to the pkvm hypervisor. + */ + if ((vcpu->arch.guest_fpu.xfeatures & XFEATURE_MASK_USER_DYNAMIC) && + pkvm_is_protected_vcpu(vcpu) && + pkvm_vcpu_realloc_fpstate(vcpu)) + return; + size = sizeof(struct kvm_cpuid_entry2) * nent; entries = alloc_pages_exact(size, GFP_KERNEL_ACCOUNT); if (!entries) { @@ -1776,10 +1825,16 @@ static void pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) memcpy(entries, (void *)e2, size); - unused_pa = kvm_call_pkvm(vcpu_after_set_cpuid, vcpu, __pa(entries)); - if (VALID_PAGE(unused_pa)) { - entries = __va(unused_pa); - free_pages_exact(entries, size); + old_entries_pa = kvm_call_pkvm(vcpu_after_set_cpuid, vcpu, + __pa(entries), PAGE_ALIGN(size)); + if (VALID_PAGE(old_entries_pa)) { + unsigned long *nr_pages = __va(old_entries_pa) + sizeof(phys_addr_t); + struct pkvm_memcache mc = { + .head = old_entries_pa, + .nr_pages = *nr_pages, + }; + + free_pkvm_memcache(&mc); } }
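Note: the following is an illustrative, self-contained sketch (not part of the patch) of the fpstate donation sizing used in pkvm_vcpu_create() and pkvm_vcpu_realloc_fpstate(): the donated area holds the struct fpstate header up to its 64-byte-aligned regs member and, for a pVM only, the full register image, rounded up to whole pages. The struct layout, macro names, and the default_size value below are stand-ins chosen for a buildable example, not the kernel's definitions.

/* Standalone sketch; compile with any C compiler. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)   ((((size_t)(x)) + ((size_t)(a) - 1)) & ~((size_t)(a) - 1))
#define PAGE_SIZE_SK     4096UL
#define PAGE_ALIGN_SK(x) ALIGN_UP((x), PAGE_SIZE_SK)

/* Stand-in for the kernel's struct fpstate: metadata, then a 64-byte-aligned regs area. */
struct fpstate_sketch {
	unsigned int size;
	unsigned int user_size;
	uint64_t xfeatures;
	uint64_t user_xfeatures;
	uint64_t xfd;
	unsigned char is_confidential;
	unsigned char in_use;
	unsigned char regs[64] __attribute__((aligned(64)));
};

int main(void)
{
	size_t default_size = 2688; /* assumed fpu_kernel_cfg.default_size */
	size_t header = ALIGN_UP(offsetof(struct fpstate_sketch, regs), 64);

	/* pVM: header plus the register image, since the hypervisor swaps the registers. */
	size_t pvm_donation = PAGE_ALIGN_SK(default_size + header);
	/* npVM: header only, the host keeps switching the real registers. */
	size_t npvm_donation = PAGE_ALIGN_SK(header);

	printf("header %zu, pVM donation %zu, npVM donation %zu\n",
	       header, pvm_donation, npvm_donation);
	return 0;
}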
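Note: a second illustrative sketch (also not part of the patch) of the teardown return convention used by pkvm_vcpu_after_set_cpuid() and pkvm_vcpu_add_fpstate(): the hypercall returns the physical address of the head page of the torn-down donation; that page starts with the next page's physical address, immediately followed by the page count that the host copies into its memcache before freeing. The struct and function names here are hypothetical stand-ins.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t phys_addr_t;

/* What the hypervisor leaves at the start of the head page. */
struct teardown_header {
	phys_addr_t next;       /* physical address of the next page in the chain */
	unsigned long nr_pages; /* total pages in the torn-down donation */
};

static unsigned long read_nr_pages(const void *head_page_va)
{
	/* Same arithmetic as the host side: skip the 'next' link, then read the count. */
	unsigned long nr;

	memcpy(&nr, (const char *)head_page_va + sizeof(phys_addr_t), sizeof(nr));
	return nr;
}

int main(void)
{
	unsigned char page[4096];
	struct teardown_header hdr = { .next = 0x1000, .nr_pages = 3 };

	memcpy(page, &hdr, sizeof(hdr));
	printf("nr_pages = %lu\n", read_nr_pages(page));
	return 0;
}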