x86/pv: Don't have %cr4.fsgsbase active behind a guest kernel's back
Currently, a 64bit PV guest can appear to set and clear FSGSBASE in %cr4, but
the bit remains set in hardware.  Therefore, the {RD,WR}{FS,GS}BASE
instructions are usable even when the guest kernel believes that they are
disabled.
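
As an illustration (not part of the commit), a minimal user-space probe run
inside such a guest can demonstrate the mismatch.  This is a sketch, assuming
a 64bit toolchain whose assembler understands the RDFSBASE mnemonic:

    /* probe-fsgsbase.c: run as an unprivileged user in the PV guest. */
    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>

    static sigjmp_buf env;

    static void on_sigill(int sig)
    {
        siglongjmp(env, 1);
    }

    int main(void)
    {
        unsigned long base;

        signal(SIGILL, on_sigill);

        if ( sigsetjmp(env, 1) == 0 )
        {
            /* #UD (SIGILL) is expected when %cr4.fsgsbase is really clear. */
            asm volatile ( "rdfsbase %0" : "=r" (base) );
            printf("RDFSBASE worked (base=%#lx) despite being disabled\n", base);
        }
        else
            printf("RDFSBASE raised SIGILL, matching the kernel's view\n");

        return 0;
    }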

The FSGSBASE feature isn't currently supported in Linux, and its context
switch path has some optimisations which rely on userspace being unable to use
the WR{FS,GS}BASE instructions.  Xen's current behaviour undermines this
expectation.
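
For context, here is a toy model (illustrative only, not real Linux code) of
the kind of optimisation meant above: if the saved user selector is nonzero,
reloading the selector re-derives the base from the descriptor table, so the
expensive base MSR need not be saved and restored.  That is only sound while
user space cannot move the base with WR{FS,GS}BASE:

    #include <assert.h>
    #include <stdint.h>

    struct task {
        uint16_t sel;
        uint64_t base;            /* Only trusted when sel == 0. */
    };

    /* Stand-in for a GDT/LDT lookup. */
    static uint64_t descriptor_base(uint16_t sel)
    {
        return (uint64_t)sel * 0x1000;
    }

    /* The optimised resume path: skip the MSR load when sel != 0. */
    static uint64_t resume_base(const struct task *t)
    {
        return t->sel ? descriptor_base(t->sel) : t->base;
    }

    int main(void)
    {
        struct task t = { .sel = 2, .base = descriptor_base(2) };

        /* A base set behind the kernel's back via WRGSBASE... */
        uint64_t live = 0xdeadbeef;

        /* ...is silently lost across a context switch. */
        assert( resume_base(&t) != live );

        return 0;
    }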

In 64bit PV guest context, always load the guest kernel's setting of FSGSBASE
into %cr4.  This requires adjusting how Xen uses the {RD,WR}{FS,GS}BASE
instructions.

 * Delete the cpu_has_fsgsbase helper.  It is no longer safe, as users need to
   check %cr4 directly.
 * The raw __rd{fs,gs}base() helpers are only safe to use when %cr4.fsgsbase
   is set.  Comment this property.
 * The {rd,wr}{fs,gs}{base,shadow}() and read_msr() helpers are updated to use
   the current %cr4 value to determine which mechanism to use.
 * toggle_guest_mode() and save_segments() are updated to avoid reading
   fs/gsbase if the values in hardware cannot be stale WRT struct vcpu.  A
   consequence of this is that the write_cr() path needs to cache the current
   bases, as subsequent context switches will skip saving the values.
 * write_cr4() is updated to ensure that the shadow %cr4.fsgsbase value is
   observed in a safe way WRT the hardware setting, if an interrupt happens to
   hit in the middle (see the worked example after this list).
 * load_segments() is updated to use the VMLOAD optimisation if FSGSBASE is
   unavailable, even if only gs_shadow needs updating.  As a minor perf
   improvement, check cpu_has_svm first to short-circuit a context-dependent
   conditional on Intel hardware.
 * pv_make_cr4() is updated for 64bit PV guests to use the guest kernel's
   choice of FSGSBASE.
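
To make the write_cr4() change concrete, the following stand-alone sketch
(illustrative only; just X86_CR4_FSGSBASE and the update expression are taken
from the patch) enumerates the intermediate shadow value for all four
transitions of the bit:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define X86_CR4_FSGSBASE (1UL << 16)

    int main(void)
    {
        for ( unsigned int old = 0; old <= 1; old++ )
            for ( unsigned int new = 0; new <= 1; new++ )
            {
                uint64_t shadow = old ? X86_CR4_FSGSBASE : 0;
                uint64_t val    = new ? X86_CR4_FSGSBASE : 0;

                /* The shadow value written before the mov to %cr4. */
                uint64_t mid = val & (shadow | ~X86_CR4_FSGSBASE);

                /*
                 * The bit appears in the intermediate shadow only when it is
                 * set both before and after the hardware write, so an
                 * interrupt hitting mid-update never sees the shadow claim
                 * FSGSBASE while %cr4 has it clear.
                 */
                assert( !(mid & X86_CR4_FSGSBASE) || (old && new) );

                printf("old=%u new=%u -> intermediate=%u\n",
                       old, new, !!(mid & X86_CR4_FSGSBASE));
            }

        return 0;
    }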

This is part of XSA-293.

Reported-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
andyhhp authored and jbeulich committed Mar 5, 2019
1 parent b2dd005 commit eccc170
Showing 8 changed files with 70 additions and 26 deletions.
16 changes: 6 additions & 10 deletions xen/arch/x86/domain.c
@@ -1302,13 +1302,8 @@ static void load_segments(struct vcpu *n)
     per_cpu(dirty_segment_mask, cpu) = 0;
 
 #ifdef CONFIG_HVM
-    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
-         !((uregs->fs | uregs->gs) & ~3) &&
-         /*
-          * The remaining part is just for optimization: If only shadow GS
-          * needs loading, there's nothing to be gained here.
-          */
-         (n->arch.pv.fs_base | n->arch.pv.gs_base_user | n->arch.pv.ldt_ents) )
+    if ( cpu_has_svm && !is_pv_32bit_vcpu(n) &&
+         !(read_cr4() & X86_CR4_FSGSBASE) && !((uregs->fs | uregs->gs) & ~3) )
     {
         unsigned long gsb = n->arch.flags & TF_kernel_mode
             ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
@@ -1487,7 +1482,8 @@ static void save_segments(struct vcpu *v)
     regs->fs = read_sreg(fs);
     regs->gs = read_sreg(gs);
 
-    if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
+    /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
+    if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) )
     {
         v->arch.pv.fs_base = __rdfsbase();
         if ( v->arch.flags & TF_kernel_mode )
@@ -1691,8 +1687,8 @@ static void __context_switch(void)
 
 #if defined(CONFIG_PV) && defined(CONFIG_HVM)
     /* Prefetch the VMCB if we expect to use it later in the context switch */
-    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
-         !cpu_has_fsgsbase && cpu_has_svm )
+    if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) &&
+         !is_idle_domain(nd) && !(read_cr4() & X86_CR4_FSGSBASE) )
         svm_load_segs(0, 0, 0, 0, 0, 0, 0);
 #endif
 
3 changes: 1 addition & 2 deletions xen/arch/x86/hvm/svm/svm.c
@@ -1574,8 +1574,7 @@ static int svm_cpu_up_prepare(unsigned int cpu)
         goto err;
 
 #ifdef CONFIG_PV
-    if ( !cpu_has_fsgsbase )
-        per_cpu(host_vmcb_va, cpu) = __map_domain_page_global(pg);
+    per_cpu(host_vmcb_va, cpu) = __map_domain_page_global(pg);
 #endif
 
     clear_domain_page(page_to_mfn(pg));
12 changes: 11 additions & 1 deletion xen/arch/x86/pv/domain.c
@@ -140,6 +140,15 @@ unsigned long pv_make_cr4(const struct vcpu *v)
     if ( d->arch.vtsc || (v->arch.pv.ctrlreg[4] & X86_CR4_TSD) )
         cr4 |= X86_CR4_TSD;
 
+    /*
+     * The {RD,WR}{FS,GS}BASE instructions are only usable in 64bit code
+     * segments.  While we must not have CR4.FSGSBASE set behind the back of
+     * a 64bit PV kernel, we do leave it set in 32bit PV context to speed up
+     * Xen's context switch path.
+     */
+    if ( !is_pv_32bit_domain(d) && !(v->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) )
+        cr4 &= ~X86_CR4_FSGSBASE;
+
     return cr4;
 }
 

@@ -371,7 +380,8 @@ void toggle_guest_mode(struct vcpu *v)
 {
     ASSERT(!is_pv_32bit_vcpu(v));
 
-    if ( cpu_has_fsgsbase )
+    /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
+    if ( read_cr4() & X86_CR4_FSGSBASE )
     {
         if ( v->arch.flags & TF_kernel_mode )
             v->arch.pv.gs_base_kernel = __rdgsbase();
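
As a quick sanity check of the rule added to pv_make_cr4() above, here is a
stand-alone toy model (hypothetical helper name; only the predicate is taken
from the patch):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define X86_CR4_FSGSBASE (1ULL << 16)

    /* Toy analogue of pv_make_cr4()'s FSGSBASE handling. */
    static uint64_t toy_make_cr4(bool pv32, uint64_t guest_cr4)
    {
        uint64_t cr4 = X86_CR4_FSGSBASE;     /* Xen's boot-time default. */

        if ( !pv32 && !(guest_cr4 & X86_CR4_FSGSBASE) )
            cr4 &= ~X86_CR4_FSGSBASE;

        return cr4;
    }

    int main(void)
    {
        /* 64bit PV: hardware follows the guest kernel's choice. */
        assert( !(toy_make_cr4(false, 0) & X86_CR4_FSGSBASE) );
        assert(   toy_make_cr4(false, X86_CR4_FSGSBASE) & X86_CR4_FSGSBASE );

        /* 32bit PV: the bit stays set to keep Xen's context switch fast. */
        assert( toy_make_cr4(true, 0) & X86_CR4_FSGSBASE );

        return 0;
    }
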
18 changes: 15 additions & 3 deletions xen/arch/x86/pv/emul-priv-op.c
@@ -780,6 +780,17 @@ static int write_cr(unsigned int reg, unsigned long val,
     }
 
     case 4: /* Write CR4 */
+        /*
+         * If this write will disable FSGSBASE, refresh Xen's idea of the
+         * guest bases now that they can no longer change.
+         */
+        if ( (curr->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) &&
+             !(val & X86_CR4_FSGSBASE) )
+        {
+            curr->arch.pv.fs_base = __rdfsbase();
+            curr->arch.pv.gs_base_kernel = __rdgsbase();
+        }
+
         curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
         write_cr4(pv_make_cr4(curr));
         ctxt_switch_levelling(curr);
@@ -828,14 +839,15 @@ static int read_msr(unsigned int reg, uint64_t *val,
     case MSR_FS_BASE:
         if ( is_pv_32bit_domain(currd) )
             break;
-        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv.fs_base;
+        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase()
+                                               : curr->arch.pv.fs_base;
         return X86EMUL_OKAY;
 
     case MSR_GS_BASE:
         if ( is_pv_32bit_domain(currd) )
             break;
-        *val = cpu_has_fsgsbase ? __rdgsbase()
-                                : curr->arch.pv.gs_base_kernel;
+        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase()
+                                               : curr->arch.pv.gs_base_kernel;
         return X86EMUL_OKAY;
 
     case MSR_SHADOW_GS_BASE:
2 changes: 1 addition & 1 deletion xen/arch/x86/setup.c
@@ -1610,7 +1610,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 
     cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
 
-    if ( cpu_has_fsgsbase )
+    if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
         set_in_cr4(X86_CR4_FSGSBASE);
 
     if ( opt_invpcid && cpu_has_invpcid )
1 change: 0 additions & 1 deletion xen/include/asm-x86/cpufeature.h
@@ -89,7 +89,6 @@
 #define cpu_has_xsaves          boot_cpu_has(X86_FEATURE_XSAVES)
 
 /* CPUID level 0x00000007:0.ebx */
-#define cpu_has_fsgsbase        boot_cpu_has(X86_FEATURE_FSGSBASE)
 #define cpu_has_bmi1            boot_cpu_has(X86_FEATURE_BMI1)
 #define cpu_has_hle             boot_cpu_has(X86_FEATURE_HLE)
 #define cpu_has_avx2            boot_cpu_has(X86_FEATURE_AVX2)
20 changes: 14 additions & 6 deletions xen/include/asm-x86/msr.h
@@ -124,6 +124,14 @@ static inline uint64_t rdtsc_ordered(void)
                          : "=a" (low), "=d" (high) \
                          : "c" (counter))
 
+/*
+ * On hardware supporting FSGSBASE, the value loaded into hardware is the
+ * guest kernel's choice for 64bit PV guests (Xen's choice for Idle, HVM and
+ * 32bit PV).
+ *
+ * Therefore, the {RD,WR}{FS,GS}BASE instructions are only safe to use if
+ * %cr4.fsgsbase is set.
+ */
 static inline unsigned long __rdfsbase(void)
 {
     unsigned long base;
@@ -154,7 +162,7 @@ static inline unsigned long rdfsbase(void)
 {
     unsigned long base;
 
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
         return __rdfsbase();
 
     rdmsrl(MSR_FS_BASE, base);
@@ -166,7 +174,7 @@ static inline unsigned long rdgsbase(void)
 {
     unsigned long base;
 
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
         return __rdgsbase();
 
     rdmsrl(MSR_GS_BASE, base);
@@ -178,7 +186,7 @@ static inline unsigned long rdgsshadow(void)
 {
     unsigned long base;
 
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
     {
         asm volatile ( "swapgs" );
         base = __rdgsbase();
@@ -192,7 +200,7 @@ static inline unsigned long rdgsshadow(void)
 
 static inline void wrfsbase(unsigned long base)
 {
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
 #ifdef HAVE_AS_FSGSBASE
         asm volatile ( "wrfsbase %0" :: "r" (base) );
 #else
@@ -204,7 +212,7 @@ static inline void wrfsbase(unsigned long base)
 
 static inline void wrgsbase(unsigned long base)
 {
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
 #ifdef HAVE_AS_FSGSBASE
         asm volatile ( "wrgsbase %0" :: "r" (base) );
 #else
@@ -216,7 +224,7 @@ static inline void wrgsbase(unsigned long base)
 
 static inline void wrgsshadow(unsigned long base)
 {
-    if ( cpu_has_fsgsbase )
+    if ( read_cr4() & X86_CR4_FSGSBASE )
     {
         asm volatile ( "swapgs\n\t"
 #ifdef HAVE_AS_FSGSBASE
24 changes: 22 additions & 2 deletions xen/include/asm-x86/processor.h
@@ -304,11 +304,31 @@ static inline unsigned long read_cr4(void)
 
 static inline void write_cr4(unsigned long val)
 {
+    struct cpu_info *info = get_cpu_info();
+
     /* No global pages in case of PCIDs enabled! */
     ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE));
 
-    get_cpu_info()->cr4 = val;
-    asm volatile ( "mov %0,%%cr4" : : "r" (val) );
+    /*
+     * On hardware supporting FSGSBASE, the value in %cr4 is the kernel's
+     * choice for 64bit PV guests, which impacts whether Xen can use the
+     * instructions.
+     *
+     * The {rd,wr}{fs,gs}base() helpers use info->cr4 to work out whether it
+     * is safe to execute the {RD,WR}{FS,GS}BASE instructions, falling back
+     * to the MSR path if not.  Some users require interrupt safety.
+     *
+     * If FSGSBASE is currently or about to become clear, reflect this in
+     * info->cr4 before updating %cr4, so an interrupt which hits in the
+     * middle won't observe FSGSBASE set in info->cr4 but clear in %cr4.
+     */
+    info->cr4 = val & (info->cr4 | ~X86_CR4_FSGSBASE);
+
+    asm volatile ( "mov %[val], %%cr4"
+                   : "+m" (info->cr4) /* Force ordering without a barrier. */
+                   : [val] "r" (val) );
+
+    info->cr4 = val;
 }
 
 /* Clear and set 'TS' bit respectively */
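
For readers unfamiliar with the idiom in the asm() above: listing info->cr4 as
a "+m" operand makes the asm statement both a reader and a writer of that
object, so the compiler may neither sink the preceding store past it nor hoist
the following store above it.  A hedged, generic illustration (hypothetical
names; an empty asm stands in for the mov to %cr4):

    #include <stdint.h>
    #include <stdio.h>

    static void ordered_stores(uint64_t *obj, uint64_t first, uint64_t final)
    {
        *obj = first;                       /* Cannot move below the asm. */

        asm volatile ( "" : "+m" (*obj) );  /* Ordering point for *obj. */

        *obj = final;                       /* Cannot move above the asm. */
    }

    int main(void)
    {
        uint64_t x = 0;

        ordered_stores(&x, 1, 2);
        printf("%llu\n", (unsigned long long)x);

        return 0;
    }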
