diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index f86ad3335529..56689e83fb75 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -68,18 +68,22 @@ static inline void kernel_fpu_begin(void) */ static inline void fpregs_lock(void) { +#ifndef __PKVM_HYP__ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_disable(); else preempt_disable(); +#endif } static inline void fpregs_unlock(void) { +#ifndef __PKVM_HYP__ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_enable(); else preempt_enable(); +#endif } /* diff --git a/arch/x86/include/asm/kvm_pkvm.h b/arch/x86/include/asm/kvm_pkvm.h index 033fbec08d48..d38f5423966d 100644 --- a/arch/x86/include/asm/kvm_pkvm.h +++ b/arch/x86/include/asm/kvm_pkvm.h @@ -312,6 +312,7 @@ enum pkvm_fn { __pkvm__cache_reg, __pkvm__update_cpuid_runtime, __pkvm__update_exception_bitmap, + __pkvm__vcpu_add_fpstate, }; #define HOST_HANDLE_EXIT 0 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 0b86a5002c84..273fc5a6ffe8 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -184,7 +184,9 @@ static inline void os_xsave(struct fpstate *fpstate) u32 hmask = mask >> 32; int err; +#ifndef __PKVM_HYP__ WARN_ON_FPU(!alternatives_patched); +#endif xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); diff --git a/arch/x86/kvm/pkvm/cpuid.c b/arch/x86/kvm/pkvm/cpuid.c index 9cb156ce1d5d..0420fbd75d81 100644 --- a/arch/x86/kvm/pkvm/cpuid.c +++ b/arch/x86/kvm/pkvm/cpuid.c @@ -146,17 +146,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, if (!xfeatures) return 0; -#ifdef __PKVM_HYP__ - /* - * TODO: The guest fpu xfd feature is enabled by the host when the host - * KVM run its kvm_check_cpuid function before calling the - * vcpu_after_set_cpuid PV interface. Revisit when implements the fpu - * isolation. - */ - return 0; -#else return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); -#endif } /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ diff --git a/arch/x86/kvm/pkvm/def.h b/arch/x86/kvm/pkvm/def.h index 874e911a6c35..e37dfce91adb 100644 --- a/arch/x86/kvm/pkvm/def.h +++ b/arch/x86/kvm/pkvm/def.h @@ -27,6 +27,8 @@ /* FIXME: Disable SGX to simplify POC */ #undef CONFIG_X86_SGX_KVM #undef CONFIG_PREEMPT_COUNT +#undef CONFIG_USE_X86_SEG_SUPPORT +#undef CONFIG_X86_DEBUG_FPU #define __NO_FORTIFY #include diff --git a/arch/x86/kvm/pkvm/fpu/core.c b/arch/x86/kvm/pkvm/fpu/core.c new file mode 100644 index 000000000000..ed367ee3e771 --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/core.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "legacy.h" +#include "xstate.h" +#include "context.h" + +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + +/* + * Represents the initial FPU state. It's mostly (but not completely) zeroes, + * depending on the FPU hardware format: + */ +struct fpstate init_fpstate __ro_after_init; + +/* + * Track which context is using the FPU on the CPU: + */ +DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Save the FPU register state in fpu->fpstate->regs. 
The register state is + * preserved. + * + * Must be called with fpregs_lock() held. + * + * The legacy FNSAVE instruction clears all FPU state unconditionally, so + * register state has to be reloaded. That might be a pointless exercise + * when the FPU is going to be used by another task right after that. But + * this only affects 20+ years old 32bit systems and avoids conditionals all + * over the place. + * + * FXSAVE and all XSAVE variants preserve the FPU register state. + */ +void save_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + os_xsave(fpu->fpstate); +#ifndef __PKVM_HYP__ + update_avx_timestamp(fpu); +#endif + return; + } + + if (likely(use_fxsr())) { + fxsave(&fpu->fpstate->regs.fxsave); + return; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to reload them from the memory state. + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); +} + +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) +{ + /* + * AMD K7/K8 and later CPUs up to Zen don't save/restore + * FDP/FIP/FOP unless an exception is pending. Clear the x87 state + * here by setting it to fixed values. "m" is a random variable + * that should be in L1. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %[addr]" /* set F?P to defined value */ + : : [addr] "m" (*fpstate)); + } + + if (use_xsave()) { + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. 
+ */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); + } else { + if (use_fxsr()) + fxrstor(&fpstate->regs.fxsave); + else + frstor(&fpstate->regs.fsave); + } +} + +static inline void fpstate_init_fxstate(struct fpstate *fpstate) +{ + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; +} + +/* + * Legacy x87 fpstate state init: + */ +static inline void fpstate_init_fstate(struct fpstate *fpstate) +{ + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; +} + +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) +{ + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { +#ifndef __PKVM_HYP__ + fpstate_init_soft(&fpstate->regs.soft); +#endif + return; + } + + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); + + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); + else + fpstate_init_fstate(fpstate); +} + +/* + * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ +#ifndef __PKVM_HYP__ + lockdep_assert_preemption_enabled(); +#endif + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be updated before + * any operation which depends on it can take place. + * + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read. + */ +void fpu_sync_guest_vmexit_xfd_state(void) +{ + struct fpstate *fps = current->thread.fpu.fpstate; + +#ifndef __PKVM_HYP__ + lockdep_assert_irqs_disabled(); +#endif + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } +} +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); +#ifdef __PKVM_HYP__ +#ifdef CONFIG_X86_64 + if (fpu_state_size_dynamic() && enter_guest) { + /* + * Refresh the xfd_state before guest vmenter so that the xfd can be + * restored after guest vmexit. 
+ */ + rdmsrl(MSR_IA32_XFD, cur_fps->xfd); + __this_cpu_write(xfd_state, cur_fps->xfd); + } +#endif + /* + * Only save the FPU registers when exiting the guest for a pVM. When + * entering the guest, the FPU registers hold the host's values, which + * the host has already saved itself. + */ + if (guest_fps->is_confidential && !enter_guest) +#else + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) +#endif + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + +#ifdef __PKVM_HYP__ + /* + * For a pVM, on guest entry restore the FPU with the data from the + * pVM's xsave area; on guest exit restore the initial data to wipe + * the pVM's FPU registers. + * + * For an npVM, no restore is needed. + */ + if (guest_fps->is_confidential) { +#else + if (!cur_fps->is_confidential) { +#endif + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = &current->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} diff --git a/arch/x86/kvm/pkvm/fpu/fpu.c b/arch/x86/kvm/pkvm/fpu/fpu.c new file mode 100644 index 000000000000..0c46cc764ade --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/fpu.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "internal.h" +#include "fpu.h" +#include "xstate.h" + +static DEFINE_PER_CPU(struct fpstate, percpu_fpstate); + +void pkvm_init_percpu_fpu(void) +{ + struct fpu *fpu = &current->thread.fpu; + + /* + * Set the current fpstate pointer to the percpu_fpstate, which is used + * to restore the FPU to the initial state before switching from a pVM + * to the host.
+ */ + fpu->fpstate = this_cpu_ptr(&percpu_fpstate); + fpstate_init_user(fpu->fpstate); + + /* The perm is initialized with the maximum features */ + fpu->perm.__state_perm = fpu_kernel_cfg.max_features; + fpu->perm.__state_size = fpu_kernel_cfg.max_size; + + fpu->guest_perm = fpu->perm; +} + +void pkvm_init_guest_fpu(struct fpu_guest *gfpu) +{ + u64 permitted = xstate_get_group_perm(true); + struct fpstate *fpstate = gfpu->fpstate; + + fpstate->xfeatures = fpu_kernel_cfg.default_features & permitted; + fpstate->user_xfeatures = fpu_user_cfg.default_features & permitted; + fpstate->xfd = 0; + + fpstate->in_use = false; + + fpstate_init_user(fpstate); + + gfpu->xfeatures = fpstate->user_xfeatures; +} diff --git a/arch/x86/kvm/pkvm/fpu/fpu.h b/arch/x86/kvm/pkvm/fpu/fpu.h new file mode 100644 index 000000000000..7e499dc9b893 --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/fpu.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PKVM_X86_FPU_H +#define __PKVM_X86_FPU_H + +void pkvm_setup_xstate_cache(void); +void pkvm_init_percpu_fpu(void); +void pkvm_init_guest_fpu(struct fpu_guest *gfpu); + +#endif /* __PKVM_X86_FPU_H */ diff --git a/arch/x86/kvm/pkvm/fpu/xstate.c b/arch/x86/kvm/pkvm/fpu/xstate.c new file mode 100644 index 000000000000..a5c6f5aa7842 --- /dev/null +++ b/arch/x86/kvm/pkvm/fpu/xstate.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include "internal.h" +#include "xstate.h" +#include "fpu.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) + +static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; + +#define XSTATE_FLAG_SUPERVISOR BIT(0) + +static bool xfeature_is_supervisor(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; +} + +static bool xfeature_enabled(enum xfeature xfeature) +{ + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); +} + +/* + * Record the offsets and sizes of various xstates contained + * in the XSAVE state memory layout. + */ +static void __init setup_xstate_cache(void) +{ + u32 eax, ebx, ecx, edx, i; + /* start at the beginning of the "extended state" */ + unsigned int last_good_offset = offsetof(struct xregs_state, + extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, + xmm_space); + + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + + xstate_sizes[i] = eax; + xstate_flags[i] = ecx; + + /* + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. + */ + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; + + /* + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. 
+ */ + WARN_ONCE(last_good_offset > xstate_offsets[i], + "x86/fpu: misordered xstate at %d\n", last_good_offset); + + last_good_offset = xstate_offsets[i]; + } +} + +#define XSTATE_FLAG_ALIGNED64 BIT(1) + +static bool xfeature_is_aligned64(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; +} + +static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) +{ + unsigned int offs, i; + + /* + * Non-compacted format and legacy features use the cached fixed + * offsets. + */ + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) + return xstate_offsets[xfeature]; + + /* + * Compacted format offsets depend on the actual content of the + * compacted xsave area which is determined by the xcomp_bv header + * field. + */ + offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for_each_extended_xfeature(i, xcomp_bv) { + if (xfeature_is_aligned64(i)) + offs = ALIGN(offs, 64); + if (i == xfeature) + break; + offs += xstate_sizes[i]; + } + return offs; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int topmost = fls64(xfeatures) - 1; + unsigned int offset = xstate_offsets[topmost]; + + if (topmost <= XFEATURE_SSE) + return sizeof(struct xregs_state); + + if (compacted) + offset = xfeature_get_offset(xfeatures, topmost); + return offset + xstate_sizes[topmost]; +} + +/* + * Given an xstate feature nr, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. + */ +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + u64 xcomp_bv = xsave->header.xcomp_bv; + + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { + if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) + return NULL; + } + + return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * + * Inputs: + * xstate: the thread's storage area for all FPU data + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) + * Output: + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + /* + * We should not ever be requesting features that we + * have not enabled. + */ + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xfeature_nr' be saved. + * If it did not, we might be seeing and old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. 
+ */ + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) + return NULL; + + return __raw_xsave_addr(xsave, xfeature_nr); +} +EXPORT_SYMBOL_GPL(get_xsave_addr); + +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + +#ifdef CONFIG_X86_64 +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; +#ifdef __PKVM_HYP__ + struct fpstate *fps; + unsigned int ksize; + + if (!xfd_event) + return 0; + + if (!guest_fpu) + return -EINVAL; + + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) + return -EPERM; + + fps = guest_fpu->fpstate; + ksize = xstate_calculate_size(fps->xfeatures | xfd_event, + cpu_feature_enabled(X86_FEATURE_XCOMPACTED)); + if (fps->size < ksize) + /* State size is insufficient. */ + return -ENOMEM; + + guest_fpu->xfeatures |= xfd_event; + fps->xfeatures |= xfd_event; + fps->user_xfeatures |= xfd_event; + fps->xfd &= ~xfd_event; + + xstate_init_xcomp_bv(&fps->regs.xsave, fps->xfeatures); + if (fps->in_use) + xfd_update_state(fps); + + return 0; +#else + struct fpu_state_perm *perm; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) + return -EFAULT; + return 0; +#endif +} +#endif + +#ifdef __PKVM_HYP__ +void pkvm_setup_xstate_cache(void) +{ + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? 
"FXSAVE" : "FSAVE"); + return; + } + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN_ON_FPU(1); + return; + } + + setup_xstate_cache(); +} +#endif diff --git a/arch/x86/kvm/pkvm/pkvm.c b/arch/x86/kvm/pkvm/pkvm.c index 69a9db4a9a9f..3763234ecc98 100644 --- a/arch/x86/kvm/pkvm/pkvm.c +++ b/arch/x86/kvm/pkvm/pkvm.c @@ -4,6 +4,7 @@ #include "x86.h" #include "pkvm.h" #include "cpuid.h" +#include "fpu/fpu.h" #include //FIXME: clean up the header files #include @@ -187,7 +188,8 @@ static int pkvm_vm_init(struct kvm *shared_kvm, unsigned long gpa) return ret; } -static int attach_pkvm_vcpu_to_vm(struct pkvm_vcpu *pkvm_vcpu, struct pkvm_vm *pkvm_vm) +static int attach_pkvm_vcpu_to_vm(struct pkvm_vcpu *pkvm_vcpu, struct fpstate *fps, + struct pkvm_vm *pkvm_vm) { struct kvm_vcpu *vcpu; struct kvm *kvm; @@ -218,6 +220,7 @@ static int attach_pkvm_vcpu_to_vm(struct pkvm_vcpu *pkvm_vcpu, struct pkvm_vm *p */ vcpu->arch.apic = pkvm_vcpu->shared_vcpu->arch.apic; vcpu->arch.apic_base = pkvm_vcpu->shared_vcpu->arch.apic_base; + vcpu->arch.guest_fpu.fpstate = fps; ret = kvm_arch_vcpu_create(vcpu); if (ret) @@ -269,27 +272,33 @@ void put_pkvm_vm(struct pkvm_vm *pkvm_vm) WARN_ON(atomic_dec_if_positive(&pkvm_vm_ref->refcount) <= 0); } -static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa) +static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa, + unsigned long size) { struct pkvm_vcpu *pkvm_vcpu; - unsigned long pkvm_vcpu_pa; struct pkvm_vm *pkvm_vm; struct kvm *shared_kvm; - size_t pa_size; + struct fpstate *fps; + unsigned long pa; + size_t fpsize; + void *va; int ret; - pkvm_vcpu_pa = host_gpa2hpa(gpa); - if (!PAGE_ALIGNED(pkvm_vcpu_pa)) - return -EINVAL; + if (!VALID_PAGE(gpa) || + !PAGE_ALIGNED(gpa) || + !PAGE_ALIGNED(size) || + (size <= PAGE_ALIGN(pkvm_vcpu_sz))) + return -ENOMEM; - pa_size = PAGE_ALIGN(pkvm_vcpu_sz); - if (__pkvm_host_donate_hyp(pkvm_vcpu_pa, pa_size)) + pa = host_gpa2hpa(gpa); + if (__pkvm_host_donate_hyp(pa, size)) return -EINVAL; - pkvm_vcpu = pkvm_phys_to_virt(pkvm_vcpu_pa); - memset(pkvm_vcpu, 0, pa_size); + va = __pkvm_va(pa); + memset(va, 0, size); - pkvm_vcpu->size = pa_size; + pkvm_vcpu = va; + pkvm_vcpu->size = PAGE_ALIGN(pkvm_vcpu_sz); /* * TODO: Assume host is already share the kvm_vcpu structure * (represented by shared_vcpu) with pkvm. So just pin @@ -304,7 +313,42 @@ static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa) goto undonate; } - ret = attach_pkvm_vcpu_to_vm(pkvm_vcpu, pkvm_vm); + fpsize = size - pkvm_vcpu->size; + if (pkvm_is_protected_vm(to_kvm(pkvm_vm))) { + /* + * The pkvm hypervisor switches the FPU registers for the pVM + * thus the fpstate size should satisfy the fpu kernel config + * default size. + */ + if (fpsize < (fpu_kernel_cfg.default_size + + ALIGN(offsetof(struct fpstate, regs), 64))) { + ret = -EINVAL; + goto put_pkvm_vm; + } + } else { + /* + * The host switches the FPU registers for the npVM thus the + * fpstate in the pkvm hypervisor is not used except for the XFD + * MSR emulation. So the fpstate size should just satisfy the + * struct fpstate size except for the regs. 
+ */ + if (fpsize < ALIGN(offsetof(struct fpstate, regs), 64)) { + ret = -EINVAL; + goto put_pkvm_vm; + } + } + + fps = (struct fpstate *)(va + pkvm_vcpu->size); + memset(fps, 0, fpsize); + /* + * Although the fpstate size represents the size of the register memory, + * use this field to save the size of the fpstate memory to simplify the + * undonating, which is the only usage of the fpstate size field in the + * pkvm hypervisor. + */ + fps->size = fpsize; + + ret = attach_pkvm_vcpu_to_vm(pkvm_vcpu, fps, pkvm_vm); if (ret) goto put_pkvm_vm; @@ -315,7 +359,7 @@ static int pkvm_vcpu_create(struct kvm_vcpu *shared_vcpu, unsigned long gpa) put_pkvm_vm: put_pkvm_vm(pkvm_vm); undonate: - __pkvm_hyp_donate_host(pkvm_vcpu_pa, pa_size); + __pkvm_hyp_donate_host(pa, size); return ret; } @@ -339,6 +383,9 @@ static void pkvm_vm_destroy(int handle) (void *)vcpu->arch.cpuid_entries, sizeof(struct kvm_cpuid_entry2) * vcpu->arch.cpuid_nent); + teardown_donated_memory(&shared_pkvm->teardown_mc, + (void *)vcpu->arch.guest_fpu.fpstate, + vcpu->arch.guest_fpu.fpstate->size); teardown_donated_memory(&shared_pkvm->teardown_mc, (void *)pkvm_vcpu, pkvm_vcpu->size); /* TODO: unpin shared kvm_vcpu */ @@ -451,6 +498,7 @@ static bool is_kvm_vcpu_accessible(struct kvm_vcpu *vcpu, unsigned long fn) case __pkvm__post_set_cr3: case __pkvm__cache_reg: case __pkvm__update_exception_bitmap: + case __pkvm__vcpu_add_fpstate: /* * FIXME: As the host still needs to pre-configure pVM's vcpu * state for booting, the protection is enforced by the pkvm @@ -539,6 +587,9 @@ static void pkvm_vcpu_load(struct pkvm_vcpu *pkvm_vcpu, int cpu) if (WARN_ON_ONCE(vcpu->cpu != -1 && vcpu->cpu != cpu)) return; + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); + kvm_x86_call(vcpu_load)(vcpu, cpu); set_pkvm_vcpu_inuse(pkvm_vcpu); @@ -703,44 +754,64 @@ static unsigned long pkvm_vcpu_run(struct pkvm_vcpu *pkvm_vcpu, bool force_immed return reqs; } -static unsigned long pkvm_vcpu_after_set_cpuid(struct pkvm_vcpu *pkvm_vcpu, unsigned long new_pa) +static unsigned long pkvm_vcpu_after_set_cpuid(struct pkvm_vcpu *pkvm_vcpu, + unsigned long new_entries_gpa, + unsigned long e2size) { + struct pkvm_memcache mc = { + .head = INVALID_PAGE, + .nr_pages = 0, + }; struct kvm_cpuid_entry2 *new, *old; - unsigned long ret = new_pa; + int new_nent, old_nent; struct kvm_vcpu *vcpu; - int nent; - u64 size; + unsigned long e2pa; - if (WARN_ON_ONCE(!pkvm_vcpu)) - return ret; + if (!VALID_PAGE(new_entries_gpa) || + !PAGE_ALIGNED(new_entries_gpa) || + !PAGE_ALIGNED(e2size) || + !e2size) + return INVALID_PAGE; - nent = pkvm_vcpu->shared_vcpu->arch.cpuid_nent; - size = PAGE_ALIGN(sizeof(struct kvm_cpuid_entry2) * nent); - if (__pkvm_host_donate_hyp(new_pa, size)) - return ret; + e2pa = host_gpa2hpa(new_entries_gpa); + if (WARN_ON_ONCE(!pkvm_vcpu) || + __pkvm_host_donate_hyp(e2pa, e2size)) + goto out; vcpu = to_kvm_vcpu(pkvm_vcpu); + old_nent = vcpu->arch.cpuid_nent; old = vcpu->arch.cpuid_entries; - new = __pkvm_va(new_pa); + new_nent = e2size / sizeof(struct kvm_cpuid_entry2); + new = __pkvm_va(e2pa); - if (kvm_set_cpuid(vcpu, new, nent) || vcpu->arch.cpuid_entries != new) { - /* New physical page is not consumed */ - __pkvm_hyp_donate_host(new_pa, size); - } else if (vcpu->arch.cpuid_entries == new) { - /* New physical page is consumed */ + if (!kvm_set_cpuid(vcpu, new, new_nent) && (vcpu->arch.cpuid_entries == new)) { + /* + * New physical page is consumed. Teardown the old cpuid + * entry memory pages if there is. 
+ */ + if (old) { - memset(old, 0, size); - /* Let the host VMM to free the old physical pages */ - ret = __pkvm_pa(old); - /* Before that, undonate the old physical pages */ - __pkvm_hyp_donate_host(ret, size); + e2pa = __pkvm_pa(old); + e2size = sizeof(struct kvm_cpuid_entry2) * old_nent; } else { - /* No physical page for the host VMM to free */ - ret = INVALID_PAGE; + e2pa = INVALID_PAGE; + e2size = 0; } } +out: + if (VALID_PAGE(e2pa)) + teardown_donated_memory(&mc, (void *)__pkvm_va(e2pa), e2size); - return ret; + if (VALID_PAGE(mc.head)) { + /* + * Store nr_pages in the first page of the torn-down memory, at the + * offset just past the sizeof(phys_addr_t) slot that holds the next + * page's physical address. + */ + unsigned long *nr_pages = __pkvm_va(mc.head) + sizeof(phys_addr_t); + + *nr_pages = mc.nr_pages; + } + return mc.head; } static void pkvm_reset_vcpu(struct pkvm_vcpu *pkvm_vcpu, bool init_event) @@ -1304,6 +1375,74 @@ static void pkvm_update_exception_bitmap(struct pkvm_vcpu *pkvm_vcpu) kvm_x86_call(update_exception_bitmap)(vcpu); } +static unsigned long pkvm_vcpu_add_fpstate(struct pkvm_vcpu *pkvm_vcpu, + unsigned long new_fps_gpa, + unsigned long fpsize) +{ + struct pkvm_memcache mc = { + .head = INVALID_PAGE, + .nr_pages = 0, + }; + struct fpstate *old_fps; + struct kvm_vcpu *vcpu; + unsigned long fpspa; + + if (!VALID_PAGE(new_fps_gpa) || + !PAGE_ALIGNED(new_fps_gpa) || + !PAGE_ALIGNED(fpsize) || + !fpsize) + return INVALID_PAGE; + + fpspa = host_gpa2hpa(new_fps_gpa); + if (WARN_ON_ONCE(!pkvm_vcpu)) + goto out; + + vcpu = to_kvm_vcpu(pkvm_vcpu); + old_fps = vcpu->arch.guest_fpu.fpstate; + /* + * The npVM's FPU state is managed by the host, so there is no need to + * swap the fpstate in the pkvm hypervisor. The fpstate size should be + * checked for the pVM. See comments in pkvm_vcpu_create. + */ + if (!pkvm_is_protected_vcpu(vcpu) || + (fpsize < (fpu_kernel_cfg.default_size + + ALIGN(offsetof(struct fpstate, regs), 64)))) + goto out; + + if (__pkvm_host_donate_hyp(fpspa, fpsize)) + goto out; + + vcpu->arch.guest_fpu.fpstate = __pkvm_va(fpspa); + memset(vcpu->arch.guest_fpu.fpstate, 0, fpsize); + /* Save the fpsize in fpstate->size. See comments in pkvm_vcpu_create */ + vcpu->arch.guest_fpu.fpstate->size = fpsize; + pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); + + if (old_fps) { + fpspa = __pkvm_pa(old_fps); + fpsize = old_fps->size; + } else { + fpspa = INVALID_PAGE; + fpsize = 0; + } +out: + if (VALID_PAGE(fpspa)) + teardown_donated_memory(&mc, (void *)__pkvm_va(fpspa), fpsize); + + if (VALID_PAGE(mc.head)) { + /* + * Store nr_pages in the first page of the torn-down memory, at the + * offset just past the sizeof(phys_addr_t) slot that holds the next + * page's physical address.
+ */ + unsigned long *nr_pages = __pkvm_va(mc.head) + sizeof(phys_addr_t); + + *nr_pages = mc.nr_pages; + } + + return mc.head; +} + static unsigned long pkvm_vcpu_handle_kvm_call(unsigned long fn, struct kvm_vcpu *shared_vcpu, unsigned long p2, unsigned long p3) @@ -1324,7 +1463,7 @@ static unsigned long pkvm_vcpu_handle_kvm_call(unsigned long fn, ret = pkvm_vcpu_run(pkvm_vcpu, (bool)p2); break; case __pkvm__vcpu_after_set_cpuid: - ret = pkvm_vcpu_after_set_cpuid(pkvm_vcpu, p2); + ret = pkvm_vcpu_after_set_cpuid(pkvm_vcpu, p2, p3); break; case __pkvm__vcpu_reset: pkvm_reset_vcpu(pkvm_vcpu, (bool)p2); @@ -1473,6 +1612,9 @@ static unsigned long pkvm_vcpu_handle_kvm_call(unsigned long fn, case __pkvm__update_exception_bitmap: pkvm_update_exception_bitmap(pkvm_vcpu); break; + case __pkvm__vcpu_add_fpstate: + ret = pkvm_vcpu_add_fpstate(pkvm_vcpu, p2, p3); + break; default: ret = -EINVAL; break; @@ -1506,7 +1648,7 @@ unsigned long handle_kvm_call(unsigned long fn, unsigned long p1, ret = 0; break; case __pkvm__vcpu_create: - ret = pkvm_vcpu_create((struct kvm_vcpu *)kern_pkvm_va((void *)p1), p2); + ret = pkvm_vcpu_create((struct kvm_vcpu *)kern_pkvm_va((void *)p1), p2, p3); break; default: ret = pkvm_vcpu_handle_kvm_call(fn, (struct kvm_vcpu *)kern_pkvm_va((void *)p1), diff --git a/arch/x86/kvm/pkvm/smp.c b/arch/x86/kvm/pkvm/smp.c index 2d4e92e4743f..f1a3b09e449e 100644 --- a/arch/x86/kvm/pkvm/smp.c +++ b/arch/x86/kvm/pkvm/smp.c @@ -9,6 +9,7 @@ unsigned long __per_cpu_offset[NR_CPUS]; DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); +DEFINE_PER_CPU(struct task_struct, cur_task); struct cpumask __cpu_possible_mask __ro_after_init; unsigned int nr_cpu_ids; @@ -22,6 +23,7 @@ unsigned int pkvm_per_cpu_nr_pages(void) int setup_pkvm_per_cpu(int cpu, unsigned long base) { + struct task_struct *task; unsigned long elf_base; if (cpu >= ARRAY_SIZE(__per_cpu_offset)) @@ -32,5 +34,9 @@ int setup_pkvm_per_cpu(int cpu, unsigned long base) per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(pcpu_hot.cpu_number, cpu) = cpu; + task = per_cpu_ptr(&cur_task, cpu); + task->group_leader = task; + per_cpu(pcpu_hot.current_task, cpu) = task; + return 0; } diff --git a/arch/x86/kvm/pkvm/vmx/vmx.c b/arch/x86/kvm/pkvm/vmx/vmx.c index 08b3252403ed..a2ea902deb4e 100644 --- a/arch/x86/kvm/pkvm/vmx/vmx.c +++ b/arch/x86/kvm/pkvm/vmx/vmx.c @@ -4482,7 +4482,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) */ if (is_nm_fault(intr_info)) { kvm_queue_exception(vcpu, NM_VECTOR); - return 0; + return 1; } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -6405,6 +6405,63 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. 
+ */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + +static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +{ +#ifdef __PKVM_HYP__ + /* if exit due to NM, handle before interrupts are enabled */ + if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(vcpu); +#else + /* if exit due to PF check for async PF */ + if (is_page_fault(intr_info)) + vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + /* if exit due to NM, handle before interrupts are enabled */ + else if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(vcpu); + /* Handle machine checks before interrupts are enabled */ + else if (is_machine_check(intr_info)) + kvm_machine_check(); +#endif +} + +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->emulation_required) + return; + +#ifdef __PKVM_HYP__ + if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); +#else + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); +#endif +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7783,28 +7840,6 @@ static void vmx_sync_vcpu_state_post_switch(struct pkvm_vcpu *pkvm_vcpu) if (pkvm_is_protected_vcpu(vcpu) && pkvm_has_req_to_host(HOST_HANDLE_EXIT, vcpu)) update_protected_vcpu_state(vcpu, shared_vcpu); - - /* - * FIXME: The MSR_IA32_XFD handling in vmx_set_msr is skipped as - * currently the FPU switching is still managed by the host. So the - * MSR_IA32_XFD emulation is forwarded to the host to handle. On behalf - * of the host, updating the MSR interception and exeption bitmap before - * entering the guest according to the xfd_no_write_intercept flag. This - * should be removed once the XFD emulation can be done in the pkvm - * hypervisor. 
- */ - if (unlikely(shared_vcpu->arch.xfd_no_write_intercept ^ - vcpu->arch.xfd_no_write_intercept)) { - vcpu->arch.xfd_no_write_intercept = - shared_vcpu->arch.xfd_no_write_intercept; - if (shared_vcpu->arch.xfd_no_write_intercept) - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, - MSR_TYPE_RW); - else - vmx_enable_intercept_for_msr(vcpu, MSR_IA32_XFD, - MSR_TYPE_RW); - vmx_update_exception_bitmap(vcpu); - } } static void share_protected_vcpu_state(struct kvm_vcpu *vcpu, @@ -8024,6 +8059,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .load_mmu_pgd = vmx_load_mmu_pgd, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + .setup_mce = vmx_setup_mce, }; diff --git a/arch/x86/kvm/pkvm/x86.c b/arch/x86/kvm/pkvm/x86.c index e19af92d95da..a39dee657ec2 100644 --- a/arch/x86/kvm/pkvm/x86.c +++ b/arch/x86/kvm/pkvm/x86.c @@ -12,6 +12,9 @@ #include #include "pkvm.h" #include +#include "fpu/fpu.h" + +#include #ifdef __PKVM_HYP__ #undef module_param_named @@ -747,6 +750,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); +#ifdef CONFIG_X86_64 +static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; +} +#endif + static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; @@ -1767,6 +1777,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#endif #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && @@ -1788,7 +1799,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; -#endif #endif default: #ifndef __PKVM_HYP__ /* FIXME: Leave to the host to emulate */ @@ -2086,6 +2096,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; +#endif #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && @@ -2101,7 +2112,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; -#endif #endif default: #ifndef __PKVM_HYP__ /* FIXME: Leave to the host to emulate */ @@ -2708,6 +2718,22 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +/* Swap (qemu) user FPU context for the guest FPU context. */ +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); + trace_kvm_fpu(1); +} + +/* When vcpu_run ends, restore user space FPU context. 
*/ +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { #ifdef __PKVM_HYP__ @@ -2717,6 +2743,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); + + if (pkvm_is_protected_vcpu(vcpu)) + fpstate_set_confidential(&vcpu->arch.guest_fpu); + return kvm_x86_call(vcpu_create)(vcpu); #else struct page *page; @@ -2873,14 +2904,28 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; +#ifdef __PKVM_HYP__ /* - * FIXME: As the guest fpu is still managed by the host and the pkvm - * hypervisor doesn't have valid fpstate in its vcpu->arch.guest_fpu, - * reset the fpstate for MPX is done by the host when runs its - * kvm_vcpu_reset. To add fpu isolation, revisit to see how to do this - * in the pkvm hypervisor. + * The pkvm hypervisor does the FPU switching for the pVM and the host + * does the FPU switching for the npVM, which means that the pkvm + * hypervisor only needs to take care the fpstate of the pVM. So only + * needs to clearing the BNDREGS/BNDCSR for the pVM. */ -#ifndef __PKVM_HYP__ + if (pkvm_is_protected_vcpu(vcpu) && + vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + bool in_use = fpstate->in_use; + + if (in_use) + kvm_put_guest_fpu(vcpu); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + + if (in_use) + kvm_load_guest_fpu(vcpu); + } +#else if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; @@ -3291,105 +3336,133 @@ static void kvm_restore_user_return_msr(void) } } -unsigned long kvm_vcpu_enter_guest(struct kvm_vcpu *vcpu, bool force_immediate_exit) +static int __kvm_vcpu_enter_guest(struct kvm_vcpu *vcpu, bool force_immediate_exit) { - struct kvm_vcpu *hvcpu = this_cpu_read(host_vcpu); + bool req_immediate_exit = false; fastpath_t exit_fastpath; - int ret, i; + int ret; - pkvm_reset_reqs_to_host(vcpu); + if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvm_vcpu_flush_tlb_all(vcpu); - if (kvm_x86_call(vcpu_pre_run)(vcpu) <= 0) - return 0; + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); - vcpu->arch.last_vmentry_cpu = vcpu->cpu; + if (kvm_check_request(KVM_REQ_EVENT, vcpu)) + kvm_check_and_inject_events(vcpu, &req_immediate_exit); + } - /* TODO: Save the host VMM fpu and load the guest fpu */ + kvm_x86_call(prepare_switch_to_guest)(vcpu); - /* Save the host debug registers */ - get_debugreg(hvcpu->arch.dr7, 7); - for (i = 0; i < KVM_NR_DB_REGS; i++) - get_debugreg(hvcpu->arch.db[i], i); + /* + * Make sure vcpu->mode is changed to IN_GUEST_MODE before + * running to mark this vcpu should be kicked for any new + * vcpu request. 
+ */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); - vcpu->arch.host_debugctl = get_debugctlmsr(); + if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); + else + req_immediate_exit = force_immediate_exit; + + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); + } for (;;) { - bool req_immediate_exit = false; + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, req_immediate_exit); - if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb_all(vcpu); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + } - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + /* Sync the guest debug registers */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_call(sync_dirty_debug_regs)(vcpu); + kvm_update_dr0123(vcpu); + kvm_update_dr7(vcpu); + } - if (kvm_check_request(KVM_REQ_EVENT, vcpu)) - kvm_check_and_inject_events(vcpu, &req_immediate_exit); - } + /* + * Make sure vcpu->mode is changed to OUTSIDE_GUEST_MODE after + * vmexit to mark this vcpu no need to be kicked for any new + * vcpu request. + */ + smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); - kvm_x86_call(prepare_switch_to_guest)(vcpu); + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); - /* - * Make sure vcpu->mode is changed to IN_GUEST_MODE before - * running to mark this vcpu should be kicked for any new - * vcpu request. - */ - smp_store_mb(vcpu->mode, IN_GUEST_MODE); - - if (req_immediate_exit) - kvm_make_request(KVM_REQ_EVENT, vcpu); - else - req_immediate_exit = force_immediate_exit; - - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.eff_db[0], 0); - set_debugreg(vcpu->arch.eff_db[1], 1); - set_debugreg(vcpu->arch.eff_db[2], 2); - set_debugreg(vcpu->arch.eff_db[3], 3); - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); - } + kvm_x86_call(handle_exit_irqoff)(vcpu); - exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, req_immediate_exit); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); - /* Sync the guest debug registers */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - kvm_x86_call(sync_dirty_debug_regs)(vcpu); - kvm_update_dr0123(vcpu); - kvm_update_dr7(vcpu); - } + ret = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); + if (ret <= 0) { + pkvm_make_req_to_host(HOST_HANDLE_EXIT, vcpu); + goto out; + } - /* - * Make sure vcpu->mode is changed to OUTSIDE_GUEST_MODE after - * vmexit to mark this vcpu no need to be kicked for any new - * vcpu request. 
- */ - smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); + if (unlikely(force_immediate_exit) || pkvm_reqs_to_host(vcpu)) + goto out; - if (unlikely(exit_fastpath == EXIT_FASTPATH_REENTER_GUEST)) - continue; + return 1; +out: + kvm_x86_call(prepare_switch_to_host)(vcpu); + return 0; +} - ret = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); - if (ret <= 0) { - pkvm_make_req_to_host(HOST_HANDLE_EXIT, vcpu); - break; - } +unsigned long kvm_vcpu_enter_guest(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + struct kvm_vcpu *hvcpu = this_cpu_read(host_vcpu); + int i; + + pkvm_reset_reqs_to_host(vcpu); + + if (kvm_x86_call(vcpu_pre_run)(vcpu) <= 0) + return 0; + + vcpu->arch.last_vmentry_cpu = vcpu->cpu; + + kvm_load_guest_fpu(vcpu); + + /* Save the host debug registers */ + get_debugreg(hvcpu->arch.dr7, 7); + for (i = 0; i < KVM_NR_DB_REGS; i++) + get_debugreg(hvcpu->arch.db[i], i); + + vcpu->arch.host_debugctl = get_debugctlmsr(); - if (unlikely(force_immediate_exit) || pkvm_reqs_to_host(vcpu)) + for (;;) { + if (__kvm_vcpu_enter_guest(vcpu, force_immediate_exit) != 1) break; } - kvm_x86_call(prepare_switch_to_host)(vcpu); - /* Restore the host debug registers */ set_debugreg(hvcpu->arch.dr7, 7); for (i = 0; i < KVM_NR_DB_REGS; i++) set_debugreg(hvcpu->arch.db[i], i); - /* TODO: Restore the host VMM fpu and save the guest fpu */ + kvm_put_guest_fpu(vcpu); kvm_restore_user_return_msr(); diff --git a/arch/x86/kvm/vmx/pkvm/hyp/Makefile b/arch/x86/kvm/vmx/pkvm/hyp/Makefile index 0fc5db3f85bc..605489adb21b 100644 --- a/arch/x86/kvm/vmx/pkvm/hyp/Makefile +++ b/arch/x86/kvm/vmx/pkvm/hyp/Makefile @@ -16,7 +16,7 @@ pkvm-hyp-y := vmx_asm.o vmexit.o memory.o page_alloc.o early_alloc.o pgtable.o m pkvm := ../../../pkvm pkvm-hyp-y += $(pkvm)/smp.o $(pkvm)/pkvm.o $(pkvm)/x86.o $(pkvm)/vmx/vmx.o $(pkvm)/vmx/vmenter.o \ - $(pkvm)/cpuid.o $(pkvm)/lapic.o + $(pkvm)/cpuid.o $(pkvm)/lapic.o $(pkvm)/fpu/core.o $(pkvm)/fpu/xstate.o $(pkvm)/fpu/fpu.o lib-dir := lib lib2-dir := ../../../../../../lib @@ -31,6 +31,9 @@ pkvm-hyp-$(CONFIG_LIST_HARDENED) += $(lib-dir)/list_debug.o pkvm-obj := $(patsubst %.o,%.pkvm.o,$(pkvm-hyp-y)) obj-$(CONFIG_PKVM_INTEL) += pkvm.o +CFLAGS_$(pkvm)/fpu/xstate.pkvm.o += -iquote $(srctree)/arch/x86/kernel/fpu +CFLAGS_$(pkvm)/fpu/fpu.pkvm.o += -iquote $(srctree)/arch/x86/kernel/fpu +CFLAGS_$(pkvm)/fpu/core.pkvm.o += -iquote $(srctree)/arch/x86/kernel/fpu AFLAGS_$(pkvm)/vmx/vmenter.pkvm.o += -iquote $(obj)/$(pkvm) $(obj)/$(pkvm)/vmx/vmenter.pkvm.o: $(obj)/$(pkvm)/kvm-asm-offsets.h diff --git a/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c index cacb57f79dbf..754db2e25a82 100644 --- a/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c +++ b/arch/x86/kvm/vmx/pkvm/hyp/init_finalise.c @@ -29,6 +29,7 @@ #include #include #include +#include bool pvmfw_present; phys_addr_t pvmfw_base; @@ -382,6 +383,8 @@ int __pkvm_init_finalise(struct kvm_vcpu *vcpu, struct pkvm_section sections[], if (ret) goto out; + pkvm_setup_xstate_cache(); + pkvm_init = true; switch_pgt: @@ -413,6 +416,8 @@ int __pkvm_init_finalise(struct kvm_vcpu *vcpu, struct pkvm_section sections[], ept_sync_global(); + pkvm_init_percpu_fpu(); + ret = pkvm_setup_lapic(pcpu, vcpu->cpu); out: return ret; diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c index 1b96d3d68757..8ee0361350bc 100644 --- a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c +++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "trace.h" #include "vmexit.h" diff 
--git a/arch/x86/kvm/vmx/pkvm/include/pkvm.h b/arch/x86/kvm/vmx/pkvm/include/pkvm.h index a4988df910f4..beed14cef67b 100644 --- a/arch/x86/kvm/vmx/pkvm/include/pkvm.h +++ b/arch/x86/kvm/vmx/pkvm/include/pkvm.h @@ -146,6 +146,12 @@ extern bool pkvm_sym(pvmfw_present); extern phys_addr_t pkvm_sym(pvmfw_base); extern phys_addr_t pkvm_sym(pvmfw_size); +extern struct fpu_state_config pkvm_sym(fpu_kernel_cfg); +extern struct fpu_state_config pkvm_sym(fpu_user_cfg); +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(pkvm_sym(__fpu_state_size_dynamic)); +#endif + PKVM_DECLARE(void, __pkvm_vmexit_entry, (void)); PKVM_DECLARE(bool, pkvm_vmexit_main, (struct kvm_vcpu *vcpu)); PKVM_DECLARE(void, pkvm_init_host_state_area, (struct pkvm_pcpu *pcpu, int cpu)); diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_host.c b/arch/x86/kvm/vmx/pkvm/pkvm_host.c index 50e9ce548916..ced8b53d372e 100644 --- a/arch/x86/kvm/vmx/pkvm/pkvm_host.c +++ b/arch/x86/kvm/vmx/pkvm/pkvm_host.c @@ -1316,6 +1316,12 @@ static void __init setup_pkvm_syms(void) cpumask_copy(&pkvm_sym(__cpu_possible_mask), cpu_possible_mask); pkvm_sym(nr_cpu_ids) = nr_cpu_ids; pkvm_sym(x86_pred_cmd) = x86_pred_cmd; + pkvm_sym(fpu_kernel_cfg) = fpu_kernel_cfg; + pkvm_sym(fpu_user_cfg) = fpu_user_cfg; +#ifdef CONFIG_X86_64 + if (fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC) + static_branch_enable(&pkvm_sym(__fpu_state_size_dynamic)); +#endif } int __init vmx_pkvm_init(void) diff --git a/arch/x86/kvm/vmx/pkvm_high.c b/arch/x86/kvm/vmx/pkvm_high.c index 1505f0b5272e..827c8ed21475 100644 --- a/arch/x86/kvm/vmx/pkvm_high.c +++ b/arch/x86/kvm/vmx/pkvm_high.c @@ -177,6 +177,8 @@ static bool pkvm_hyp_emulated_msr(u32 msr) case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: #endif case MSR_IA32_SYSENTER_CS: case MSR_IA32_SYSENTER_EIP: @@ -900,13 +902,29 @@ static int pkvm_vcpu_create(struct kvm_vcpu *vcpu) } pkvm_vcpu_sz = PAGE_ALIGN(PKVM_SHADOW_VCPU_STATE_SIZE); + if (pkvm_is_protected_vcpu(vcpu)) + /* + * The pVM FPU registers will be switched by the pkvm + * hypervisor. Allocate the fpstate regs memory according + * to the real guest_fpu.fpstate size. + */ + pkvm_vcpu_sz += PAGE_ALIGN(vcpu->arch.guest_fpu.fpstate->size + + ALIGN(offsetof(struct fpstate, regs), 64)); + else + /* + * The npVM FPU registers will be switched by the host. No need + * to count the real guest_fpu.fpstate size but just strcut + * fpstate size except for the regs. + */ + pkvm_vcpu_sz += PAGE_ALIGN(ALIGN(offsetof(struct fpstate, regs), 64)); + pkvm_vcpu = alloc_pages_exact(pkvm_vcpu_sz, GFP_KERNEL_ACCOUNT); if (!pkvm_vcpu) goto free_ve; /* TODO: share struct vcpu_vmx with pkvm */ - ret = kvm_call_pkvm(vcpu_create, vcpu, __pa(pkvm_vcpu)); + ret = kvm_call_pkvm(vcpu_create, vcpu, __pa(pkvm_vcpu), pkvm_vcpu_sz); if (ret < 0) goto free_pages; @@ -1046,8 +1064,6 @@ static int pkvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int pkvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - int ret; - /* Use PV interface to set the MSR emulated by the pkvm hypervisor */ if (pkvm_hyp_emulated_msr(msr_info->index)) { if (!vcpu->arch.guest_state_protected) { @@ -1065,20 +1081,7 @@ static int pkvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } /* Otherwise handle by the host VMM itself */ - ret = kvm_set_msr_common(vcpu, msr_info); - if (ret) - return ret; - - /* - * FIXME: The pkvm hypervisor will disable the write intercept for the - * XFD MSR. 
But as the FPU switching is done by the host, has to set the - * xfd_no_write_intercept here. Once the FPU switching can be done in - * the pkvm hypervisor, this can be removed. - */ - if (msr_info->index == MSR_IA32_XFD && msr_info->data) - vcpu->arch.xfd_no_write_intercept = true; - - return 0; + return kvm_set_msr_common(vcpu, msr_info); } static u64 pkvm_get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -1433,8 +1436,15 @@ static fastpath_t pkvm_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit * can be enabled earlier. */ if (unlikely(vcpu->kvm->arch.has_protected_state && - !vcpu->arch.guest_state_protected)) + !vcpu->arch.guest_state_protected)) { vcpu->arch.guest_state_protected = true; + /* + * Mark the guest_fpu as confidential so that the host VMM does not do + * the FPU switching for the pVM, as this is done by the pkvm + * hypervisor. + */ + fpstate_set_confidential(&vcpu->arch.guest_fpu); + } if (unlikely(vmx->exit_reason.full == 0xdead)) { vmx->fail = 1; @@ -1756,17 +1766,56 @@ static void pkvm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, } } +static int pkvm_vcpu_realloc_fpstate(struct kvm_vcpu *vcpu) +{ + unsigned long old_fpspa; + size_t fpsize; + void *fps; + + fpsize = PAGE_ALIGN(vcpu->arch.guest_fpu.fpstate->size + + ALIGN(offsetof(struct fpstate, regs), 64)); + fps = alloc_pages_exact(fpsize, GFP_KERNEL_ACCOUNT); + if (!fps) + return -ENOMEM; + + old_fpspa = kvm_call_pkvm(vcpu_add_fpstate, vcpu, __pa(fps), fpsize); + if (VALID_PAGE(old_fpspa)) { + unsigned long *nr_pages = __va(old_fpspa) + sizeof(phys_addr_t); + struct pkvm_memcache mc = { + .head = old_fpspa, + .nr_pages = *nr_pages, + }; + + free_pkvm_memcache(&mc); + } + + return 0; +} + static void pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *e2 = vcpu->arch.cpuid_entries; int nent = vcpu->arch.cpuid_nent; - unsigned long unused_pa; + unsigned long old_entries_pa; void *entries; size_t size; if (vcpu->arch.guest_state_protected || !e2 || !nent) return; + /* + * With the FPU dynamic features exposed via the cpuid, the fpstate + * allocated when creating the vcpu may not be sufficient for the + * guest. As the pVM's FPU state is managed by the pkvm hypervisor + * while the npVM's FPU state is managed by the host, re-allocating the + * fpstate is only necessary for the pVM, and should be done before + * adding the new cpuid entries to the pkvm hypervisor. + */ + if ((vcpu->arch.guest_fpu.xfeatures & XFEATURE_MASK_USER_DYNAMIC) && + pkvm_is_protected_vcpu(vcpu) && + pkvm_vcpu_realloc_fpstate(vcpu)) + return; + size = sizeof(struct kvm_cpuid_entry2) * nent; entries = alloc_pages_exact(size, GFP_KERNEL_ACCOUNT); if (!entries) { @@ -1776,10 +1825,16 @@ static void pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) memcpy(entries, (void *)e2, size); - unused_pa = kvm_call_pkvm(vcpu_after_set_cpuid, vcpu, __pa(entries)); - if (VALID_PAGE(unused_pa)) { - entries = __va(unused_pa); - free_pages_exact(entries, size); + old_entries_pa = kvm_call_pkvm(vcpu_after_set_cpuid, vcpu, + __pa(entries), PAGE_ALIGN(size)); + if (VALID_PAGE(old_entries_pa)) { + unsigned long *nr_pages = __va(old_entries_pa) + sizeof(phys_addr_t); + struct pkvm_memcache mc = { + .head = old_entries_pa, + .nr_pages = *nr_pages, + }; + + free_pkvm_memcache(&mc); } }
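Note: the following is an illustrative, self-contained sketch (not part of the patch) of the fpstate donation sizing used in pkvm_vcpu_create() and pkvm_vcpu_realloc_fpstate(): the donated area holds the struct fpstate header up to its 64-byte-aligned regs member and, for a pVM only, the full register image, rounded up to whole pages. The struct layout, macro names, and the default_size value below are stand-ins chosen for a buildable example, not the kernel's definitions.

/* Standalone sketch; compile with any C compiler. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)   ((((size_t)(x)) + ((size_t)(a) - 1)) & ~((size_t)(a) - 1))
#define PAGE_SIZE_SK     4096UL
#define PAGE_ALIGN_SK(x) ALIGN_UP((x), PAGE_SIZE_SK)

/* Stand-in for the kernel's struct fpstate: metadata, then a 64-byte-aligned regs area. */
struct fpstate_sketch {
	unsigned int size;
	unsigned int user_size;
	uint64_t xfeatures;
	uint64_t user_xfeatures;
	uint64_t xfd;
	unsigned char is_confidential;
	unsigned char in_use;
	unsigned char regs[64] __attribute__((aligned(64)));
};

int main(void)
{
	size_t default_size = 2688; /* assumed fpu_kernel_cfg.default_size */
	size_t header = ALIGN_UP(offsetof(struct fpstate_sketch, regs), 64);

	/* pVM: header plus the register image, since the hypervisor swaps the registers. */
	size_t pvm_donation = PAGE_ALIGN_SK(default_size + header);
	/* npVM: header only, the host keeps switching the real registers. */
	size_t npvm_donation = PAGE_ALIGN_SK(header);

	printf("header %zu, pVM donation %zu, npVM donation %zu\n",
	       header, pvm_donation, npvm_donation);
	return 0;
}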
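Note: a second illustrative sketch (also not part of the patch) of the teardown return convention used by pkvm_vcpu_after_set_cpuid() and pkvm_vcpu_add_fpstate(): the hypercall returns the physical address of the head page of the torn-down donation; that page starts with the next page's physical address, immediately followed by the page count that the host copies into its memcache before freeing. The struct and function names here are hypothetical stand-ins.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t phys_addr_t;

/* What the hypervisor leaves at the start of the head page. */
struct teardown_header {
	phys_addr_t next;       /* physical address of the next page in the chain */
	unsigned long nr_pages; /* total pages in the torn-down donation */
};

static unsigned long read_nr_pages(const void *head_page_va)
{
	/* Same arithmetic as the host side: skip the 'next' link, then read the count. */
	unsigned long nr;

	memcpy(&nr, (const char *)head_page_va + sizeof(phys_addr_t), sizeof(nr));
	return nr;
}

int main(void)
{
	unsigned char page[4096];
	struct teardown_header hdr = { .next = 0x1000, .nr_pages = 3 };

	memcpy(page, &hdr, sizeof(hdr));
	printf("nr_pages = %lu\n", read_nr_pages(page));
	return 0;
}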