diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index fcb565b0fb..8d579e2cf9 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1302,13 +1302,8 @@ static void load_segments(struct vcpu *n)
     per_cpu(dirty_segment_mask, cpu) = 0;
 
 #ifdef CONFIG_HVM
-    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
-         !((uregs->fs | uregs->gs) & ~3) &&
-         /*
-          * The remaining part is just for optimization: If only shadow GS
-          * needs loading, there's nothing to be gained here.
-          */
-         (n->arch.pv.fs_base | n->arch.pv.gs_base_user | n->arch.pv.ldt_ents) )
+    if ( cpu_has_svm && !is_pv_32bit_vcpu(n) &&
+         !(read_cr4() & X86_CR4_FSGSBASE) && !((uregs->fs | uregs->gs) & ~3) )
     {
         unsigned long gsb = n->arch.flags & TF_kernel_mode
             ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
@@ -1487,7 +1482,8 @@ static void save_segments(struct vcpu *v)
     regs->fs = read_sreg(fs);
     regs->gs = read_sreg(gs);
 
-    if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
+    /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
+    if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) )
     {
         v->arch.pv.fs_base = __rdfsbase();
         if ( v->arch.flags & TF_kernel_mode )
@@ -1691,8 +1687,8 @@ static void __context_switch(void)
 
 #if defined(CONFIG_PV) && defined(CONFIG_HVM)
     /* Prefetch the VMCB if we expect to use it later in the context switch */
-    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
-         !cpu_has_fsgsbase && cpu_has_svm )
+    if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) &&
+         !is_idle_domain(nd) && !(read_cr4() & X86_CR4_FSGSBASE) )
         svm_load_segs(0, 0, 0, 0, 0, 0, 0);
 #endif
 
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 2584b90ce2..23d72e81e3 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1574,8 +1574,7 @@ static int svm_cpu_up_prepare(unsigned int cpu)
             goto err;
 
 #ifdef CONFIG_PV
-        if ( !cpu_has_fsgsbase )
-            per_cpu(host_vmcb_va, cpu) = __map_domain_page_global(pg);
+        per_cpu(host_vmcb_va, cpu) = __map_domain_page_global(pg);
 #endif
 
         clear_domain_page(page_to_mfn(pg));
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 91123a8b6c..3e82cffcae 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -140,6 +140,15 @@ unsigned long pv_make_cr4(const struct vcpu *v)
     if ( d->arch.vtsc || (v->arch.pv.ctrlreg[4] & X86_CR4_TSD) )
         cr4 |= X86_CR4_TSD;
 
+    /*
+     * The {RD,WR}{FS,GS}BASE instructions are only usable in 64bit code
+     * segments.  While we must not have CR4.FSGSBASE set behind the back of
+     * a 64bit PV kernel, we do leave it set in 32bit PV context to speed up
+     * Xen's context switch path.
+     */
+    if ( !is_pv_32bit_domain(d) && !(v->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) )
+        cr4 &= ~X86_CR4_FSGSBASE;
+
     return cr4;
 }
 
@@ -371,7 +380,8 @@ void toggle_guest_mode(struct vcpu *v)
 {
     ASSERT(!is_pv_32bit_vcpu(v));
 
-    if ( cpu_has_fsgsbase )
+    /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
+    if ( read_cr4() & X86_CR4_FSGSBASE )
     {
         if ( v->arch.flags & TF_kernel_mode )
             v->arch.pv.gs_base_kernel = __rdgsbase();
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index b4a56f9c81..3746e2ad54 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -780,6 +780,17 @@ static int write_cr(unsigned int reg, unsigned long val,
     }
 
     case 4: /* Write CR4 */
+        /*
+         * If this write will disable FSGSBASE, refresh Xen's idea of the
+         * guest bases now that they can no longer change.
+         */
+        if ( (curr->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) &&
+             !(val & X86_CR4_FSGSBASE) )
+        {
+            curr->arch.pv.fs_base = __rdfsbase();
+            curr->arch.pv.gs_base_kernel = __rdgsbase();
+        }
+
         curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
         write_cr4(pv_make_cr4(curr));
         ctxt_switch_levelling(curr);
@@ -828,14 +839,15 @@ static int read_msr(unsigned int reg, uint64_t *val,
     case MSR_FS_BASE:
         if ( is_pv_32bit_domain(currd) )
             break;
-        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv.fs_base;
+        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase()
+                                               : curr->arch.pv.fs_base;
         return X86EMUL_OKAY;
 
     case MSR_GS_BASE:
         if ( is_pv_32bit_domain(currd) )
             break;
-        *val = cpu_has_fsgsbase ? __rdgsbase()
-                                : curr->arch.pv.gs_base_kernel;
+        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase()
+                                               : curr->arch.pv.gs_base_kernel;
         return X86EMUL_OKAY;
 
     case MSR_SHADOW_GS_BASE:
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 92da060915..3440794275 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1610,7 +1610,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 
     cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
 
-    if ( cpu_has_fsgsbase )
+    if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
         set_in_cr4(X86_CR4_FSGSBASE);
 
     if ( opt_invpcid && cpu_has_invpcid )
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 5592e1749d..1fb9af4b19 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -89,7 +89,6 @@
 #define cpu_has_xsaves          boot_cpu_has(X86_FEATURE_XSAVES)
 
 /* CPUID level 0x00000007:0.ebx */
-#define cpu_has_fsgsbase        boot_cpu_has(X86_FEATURE_FSGSBASE)
 #define cpu_has_bmi1            boot_cpu_has(X86_FEATURE_BMI1)
 #define cpu_has_hle             boot_cpu_has(X86_FEATURE_HLE)
 #define cpu_has_avx2            boot_cpu_has(X86_FEATURE_AVX2)
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index adfa2fa05b..a7244793bf 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -124,6 +124,14 @@ static inline uint64_t rdtsc_ordered(void)
                   : "=a" (low), "=d" (high) \
                   : "c" (counter))
 
+/*
+ * On hardware supporting FSGSBASE, the value loaded into hardware is the
+ * guest kernel's choice for 64bit PV guests (Xen's choice for Idle, HVM and
+ * 32bit PV).
+ *
+ * Therefore, the {RD,WR}{FS,GS}BASE instructions are only safe to use if
+ * %cr4.fsgsbase is set.
+ */
 static inline unsigned long __rdfsbase(void)
 {
     unsigned long base;
@@ -154,7 +162,7 @@ static inline unsigned long rdfsbase(void)
 {
     unsigned long base;
 
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
         return __rdfsbase();
 
     rdmsrl(MSR_FS_BASE, base);
@@ -166,7 +174,7 @@ static inline unsigned long rdgsbase(void)
 {
     unsigned long base;
 
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
         return __rdgsbase();
 
     rdmsrl(MSR_GS_BASE, base);
@@ -178,7 +186,7 @@ static inline unsigned long rdgsshadow(void)
 {
     unsigned long base;
 
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
     {
         asm volatile ( "swapgs" );
         base = __rdgsbase();
@@ -192,7 +200,7 @@ static inline unsigned long rdgsshadow(void)
 
 static inline void wrfsbase(unsigned long base)
 {
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
 #ifdef HAVE_AS_FSGSBASE
         asm volatile ( "wrfsbase %0" :: "r" (base) );
 #else
@@ -204,7 +212,7 @@ static inline void wrfsbase(unsigned long base)
 
 static inline void wrgsbase(unsigned long base)
 {
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
 #ifdef HAVE_AS_FSGSBASE
         asm volatile ( "wrgsbase %0" :: "r" (base) );
 #else
@@ -216,7 +224,7 @@ static inline void wrgsbase(unsigned long base)
 
 static inline void wrgsshadow(unsigned long base)
 {
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
     {
         asm volatile ( "swapgs\n\t"
 #ifdef HAVE_AS_FSGSBASE
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index df01ae30d7..f3275ca5d3 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -304,11 +304,31 @@ static inline unsigned long read_cr4(void)
 
 static inline void write_cr4(unsigned long val)
 {
+    struct cpu_info *info = get_cpu_info();
+
     /* No global pages in case of PCIDs enabled! */
     ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE));
 
-    get_cpu_info()->cr4 = val;
-    asm volatile ( "mov %0,%%cr4" : : "r" (val) );
+    /*
+     * On hardware supporting FSGSBASE, the value in %cr4 is the kernel's
+     * choice for 64bit PV guests, which impacts whether Xen can use the
+     * instructions.
+     *
+     * The {rd,wr}{fs,gs}base() helpers use info->cr4 to work out whether it
+     * is safe to execute the {RD,WR}{FS,GS}BASE instruction, falling back to
+     * the MSR path if not.  Some users require interrupt safety.
+     *
+     * If FSGSBASE is currently or about to become clear, reflect this in
+     * info->cr4 before updating %cr4, so an interrupt which hits in the
+     * middle won't observe FSGSBASE set in info->cr4 but clear in %cr4.
+     */
+    info->cr4 = val & (info->cr4 | ~X86_CR4_FSGSBASE);
+
+    asm volatile ( "mov %[val], %%cr4"
+                   : "+m" (info->cr4) /* Force ordering without a barrier. */
+                   : [val] "r" (val) );
+
+    info->cr4 = val;
 }
 
 /* Clear and set 'TS' bit respectively */
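Not part of the patch above: a small standalone sketch (illustrative only, hypothetical code) of the masking used in the new write_cr4(). The interim assignment info->cr4 = val & (info->cr4 | ~X86_CR4_FSGSBASE) is constructed so that the cached copy can only ever differ from the final value by having FSGSBASE clear, i.e. an interrupt taken between the cache update and the real %cr4 write never sees FSGSBASE set in info->cr4 while it is still clear in hardware. The program below just walks the four old/new combinations of the bit:

/* Illustrative only -- not Xen code. */
#include <stdio.h>

#define X86_CR4_FSGSBASE (1UL << 16)

/* Interim value cached in info->cr4 before the real %cr4 update. */
static unsigned long interim(unsigned long cached, unsigned long val)
{
    return val & (cached | ~X86_CR4_FSGSBASE);
}

int main(void)
{
    const unsigned long f = X86_CR4_FSGSBASE;

    printf("clear -> clear: %#lx\n", interim(0, 0)); /* stays clear */
    printf("clear -> set:   %#lx\n", interim(0, f)); /* set only after %cr4 is written */
    printf("set   -> clear: %#lx\n", interim(f, 0)); /* cleared before %cr4 is written */
    printf("set   -> set:   %#lx\n", interim(f, f)); /* stays set */

    return 0;
}

In all four cases the interim value either equals the new value or has only FSGSBASE cleared, which is exactly the property the comment in write_cr4() relies on for interrupt safety of the {rd,wr}{fs,gs}base() helpers.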