Skip to content

Commit

Permalink
kvm: share upper halves among all pagetables
Browse files Browse the repository at this point in the history
Fixes: #509

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antfin.com>
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
  • Loading branch information
laijs authored and Lai Jiangshan committed Nov 2, 2020
1 parent dd05611 commit 3425485
Show file tree
Hide file tree
Showing 14 changed files with 149 additions and 65 deletions.
3 changes: 1 addition & 2 deletions pkg/sentry/platform/kvm/kvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,7 @@ func (*KVM) MaxUserAddress() usermem.Addr {
// NewAddressSpace returns a new pagetable root.
func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
// Allocate page tables and install system mappings.
pageTables := pagetables.New(newAllocator())
k.machine.mapUpperHalf(pageTables)
pageTables := pagetables.NewWithUpper(newAllocator(), k.machine.upperSharedPageTables, ring0.KernelStartAddress)

// Return the new address space.
return &addressSpace{
Expand Down
15 changes: 11 additions & 4 deletions pkg/sentry/platform/kvm/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ type machine struct {
// slots are currently being updated, and the caller should retry.
nextSlot uint32

// upperSharedPageTables tracks the read-only shared upper of all the pagetables.
upperSharedPageTables *pagetables.PageTables

// kernel is the set of global structures.
kernel ring0.Kernel

Expand Down Expand Up @@ -199,9 +202,7 @@ func newMachine(vm int) (*machine, error) {
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
m.vCPUsByTID = make(map[uint64]*vCPU)
m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
m.kernel.Init(ring0.KernelOpts{
PageTables: pagetables.New(newAllocator()),
}, m.maxVCPUs)
m.kernel.Init(m.maxVCPUs)

// Pull the maximum slots.
maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
Expand All @@ -213,6 +214,13 @@ func newMachine(vm int) (*machine, error) {
log.Debugf("The maximum number of slots is %d.", m.maxSlots)
m.usedSlots = make([]uintptr, m.maxSlots)

// Create the upper shared pagetables and kernel(sentry) pagetables.
m.upperSharedPageTables = pagetables.New(newAllocator())
m.mapUpperHalf(m.upperSharedPageTables)
m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
m.upperSharedPageTables.MarkReadOnlyShared()
m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)

// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
Expand All @@ -226,7 +234,6 @@ func newMachine(vm int) (*machine, error) {

return true // Keep iterating.
})
m.mapUpperHalf(m.kernel.PageTables)

var physicalRegionsReadOnly []physicalRegion
var physicalRegionsAvailable []physicalRegion
Expand Down
31 changes: 14 additions & 17 deletions pkg/sentry/platform/kvm/machine_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,30 +432,27 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return physicalRegions
}

var execRegions = func() (regions []region) {
func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
// Map all the executable regions so that all the entry functions
// are mapped in the upper half.
applyVirtualRegions(func(vr virtualRegion) {
if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
return
}

if vr.accessType.Execute {
regions = append(regions, vr.region)
r := vr.region
physical, length, ok := translateToPhysical(r.virtual)
if !ok || length < r.length {
panic("impossible translation")
}
pageTable.Map(
usermem.Addr(ring0.KernelStartAddress|r.virtual),
r.length,
pagetables.MapOpts{AccessType: usermem.Execute},
physical)
}
})
return
}()

func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
for _, r := range execRegions {
physical, length, ok := translateToPhysical(r.virtual)
if !ok || length < r.length {
panic("impossilbe translation")
}
pageTable.Map(
usermem.Addr(ring0.KernelStartAddress|r.virtual),
r.length,
pagetables.MapOpts{AccessType: usermem.Execute},
physical)
}
for start, end := range m.kernel.EntryRegions() {
regionLen := end - start
physical, length, ok := translateToPhysical(start)
Expand Down
3 changes: 3 additions & 0 deletions pkg/sentry/platform/ring0/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ import (
//
// This contains global state, shared by multiple CPUs.
type Kernel struct {
// PageTables are the kernel pagetables; this must be provided.
//
// Init takes no page tables argument, so the owner of the Kernel is
// expected to assign this field directly.
PageTables *pagetables.PageTables

// KernelArchState is the embedded architecture-specific state.
KernelArchState
}

Expand Down
10 changes: 1 addition & 9 deletions pkg/sentry/platform/ring0/defs_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,9 @@ var (
KernelDataSegment SegmentDescriptor
)

// KernelOpts has initialization options for the kernel.
type KernelOpts struct {
// PageTables are the kernel pagetables; this must be provided.
PageTables *pagetables.PageTables
}

// KernelArchState contains architecture-specific state.
type KernelArchState struct {
KernelOpts

// cpuEntries is array of kernelEntry for all cpus
// cpuEntries is array of kernelEntry for all cpus.
cpuEntries []kernelEntry

// globalIDT is our set of interrupt gates.
Expand Down
7 changes: 0 additions & 7 deletions pkg/sentry/platform/ring0/defs_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,8 @@ var (
KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
)

// KernelOpts has initialization options for the kernel.
type KernelOpts struct {
// PageTables are the kernel pagetables; this must be provided.
PageTables *pagetables.PageTables
}

// KernelArchState contains architecture-specific state.
type KernelArchState struct {
KernelOpts
}

// CPUArchState contains CPU-specific arch state.
Expand Down
6 changes: 2 additions & 4 deletions pkg/sentry/platform/ring0/kernel.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,9 @@ package ring0

// Init initializes a new kernel.
//
// N.B. that constraints on KernelOpts must be satisfied.
//
//go:nosplit
func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
k.init(opts, maxCPUs)
func (k *Kernel) Init(maxCPUs int) {
k.init(maxCPUs)
}

// Halt halts execution.
Expand Down
5 changes: 1 addition & 4 deletions pkg/sentry/platform/ring0/kernel_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ import (
)

// init initializes architecture-specific state.
func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
// Save the root page tables.
k.PageTables = opts.PageTables

func (k *Kernel) init(maxCPUs int) {
entrySize := reflect.TypeOf(kernelEntry{}).Size()
var (
entries []kernelEntry
Expand Down
4 changes: 1 addition & 3 deletions pkg/sentry/platform/ring0/kernel_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ func HaltAndResume()
func HaltEl1SvcAndResume()

// init initializes architecture-specific state.
func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
// Save the root page tables.
k.PageTables = opts.PageTables
func (k *Kernel) init(maxCPUs int) {
}

// init initializes architecture-specific state.
Expand Down
84 changes: 81 additions & 3 deletions pkg/sentry/platform/ring0/pagetables/pagetables.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ type PageTables struct {
Allocator Allocator

// root is the pagetable root.
//
// For some archs such as amd64, the upper of the PTEs is cloned
// from and owned by upperSharedPageTables which are shared among
// many PageTables if upperSharedPageTables is not nil.
root *PTEs

// rootPhysical is the cached physical address of the root.
Expand All @@ -39,15 +43,52 @@ type PageTables struct {

// archPageTables includes architecture-specific features.
archPageTables

// upperSharedPageTables represents a read-only shared upper
// of the Pagetable. When it is not nil, the upper is not
// allowed to be modified.
upperSharedPageTables *PageTables

// upperStart is the start address of the upper portion that
// is shared from upperSharedPageTables.
upperStart uintptr

// readOnlyShared indicates the Pagetables are read-only and
// own the ranges that are shared with other Pagetables.
readOnlyShared bool
}

// New returns new PageTables.
func New(a Allocator) *PageTables {
// NewWithUpper returns new PageTables backed by allocator a.
//
// When upperSharedPageTables is non-nil, it is used for mapping the upper
// portion of the address space, starting at upperStart. Those PageTables
// should not be touched (as invalidations may be incorrect) once they have
// been passed in here; they may be used again only after all dependent
// PageTables are gone. The intended use case is for kernel page tables,
// which are static and fixed.
//
// When upperSharedPageTables is nil, upperStart is ignored and nothing is
// shared.
//
// Precondition: upperStart must be in a canonical range.
// Precondition: upperStart must be pgdSize aligned.
// Precondition: upperSharedPageTables must be marked read-only shared;
// NewWithUpper panics otherwise.
func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables {
	pt := &PageTables{}
	pt.Init(a)
	if upperSharedPageTables == nil {
		// No shared upper portion was requested.
		return pt
	}
	if !upperSharedPageTables.readOnlyShared {
		panic("Only read-only shared pagetables can be used as upper")
	}
	pt.upperSharedPageTables = upperSharedPageTables
	pt.upperStart = upperStart
	pt.cloneUpperShared()
	return pt
}

// New returns new PageTables backed by allocator a, without any shared
// upper portion (it calls NewWithUpper with a nil upperSharedPageTables).
func New(a Allocator) *PageTables {
return NewWithUpper(a, nil, 0)
}

// mapVisitor is used for map.
type mapVisitor struct {
target uintptr // Input.
Expand Down Expand Up @@ -90,6 +131,21 @@ func (*mapVisitor) requiresSplit() bool { return true }
//
//go:nosplit
func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
if p.readOnlyShared {
panic("Should not modify read-only shared pagetables.")
}
if uintptr(addr)+length < uintptr(addr) {
panic("addr & length overflow")
}
if p.upperSharedPageTables != nil {
// ignore change to the read-only upper shared portion.
if uintptr(addr) >= p.upperStart {
return false
}
if uintptr(addr)+length > p.upperStart {
length = p.upperStart - uintptr(addr)
}
}
if !opts.AccessType.Any() {
return p.Unmap(addr, length)
}
Expand Down Expand Up @@ -128,12 +184,27 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
//
// True is returned iff there was a previous mapping in the range.
//
// Precondition: addr & length must be page-aligned.
// Precondition: addr & length must be page-aligned, their sum must not overflow.
//
// +checkescape:hard,stack
//
//go:nosplit
func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
if p.readOnlyShared {
panic("Should not modify read-only shared pagetables.")
}
if uintptr(addr)+length < uintptr(addr) {
panic("addr & length overflow")
}
if p.upperSharedPageTables != nil {
// ignore change to the read-only upper shared portion.
if uintptr(addr) >= p.upperStart {
return false
}
if uintptr(addr)+length > p.upperStart {
length = p.upperStart - uintptr(addr)
}
}
w := unmapWalker{
pageTables: p,
visitor: unmapVisitor{
Expand Down Expand Up @@ -218,3 +289,10 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts)
w.iterateRange(uintptr(addr), uintptr(addr)+1)
return w.visitor.physical + offset, w.visitor.opts
}

// MarkReadOnlyShared marks the pagetables as read-only so that they can be
// shared.
//
// It is usually used on the pagetables that are used as the upper portion
// of other PageTables. Once marked, Map and Unmap panic when called on these
// pagetables, and NewWithUpper accepts them as upperSharedPageTables.
func (p *PageTables) MarkReadOnlyShared() {
p.readOnlyShared = true
}
10 changes: 1 addition & 9 deletions pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,6 @@ import (

// archPageTables is architecture-specific data.
type archPageTables struct {
// root is the pagetable root for kernel space.
root *PTEs

// rootPhysical is the cached physical address of the root.
//
// This is saved only to prevent constant translation.
rootPhysical uintptr

asid uint16
}

Expand All @@ -46,7 +38,7 @@ func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 {
//
//go:nosplit
func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
return uint64(p.upperSharedPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
}

// Bits in page table entries.
Expand Down
21 changes: 21 additions & 0 deletions pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,26 @@ func (p *PageTables) Init(allocator Allocator) {
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
}

// pgdIndex returns the index of the root-level entry that covers upperStart.
//
// Addresses below lowerTop are indexed as upperStart/pgdSize. Addresses at
// or above upperBottom are indexed from the middle of the table, i.e.
// entriesPerPage/2 plus the number of pgdSize steps above upperBottom.
//
// It panics if upperStart is not pgdSize aligned, or if it lies in neither
// of the two ranges.
func pgdIndex(upperStart uintptr) uintptr {
	if misaligned := upperStart & (pgdSize - 1); misaligned != 0 {
		panic("upperStart should be pgd size aligned")
	}
	switch {
	case upperStart >= upperBottom:
		offset := upperStart - upperBottom
		return entriesPerPage/2 + offset/pgdSize
	case upperStart < lowerTop:
		return upperStart / pgdSize
	default:
		panic("upperStart should be in canonical range")
	}
}

// cloneUpperShared copies the root entries of the upper portion, from the
// entry covering upperStart through the last entry, out of
// upperSharedPageTables' root into this root.
//
//go:nosplit
func (p *PageTables) cloneUpperShared() {
	first := pgdIndex(p.upperStart)
	dst := p.root[first:]
	src := p.upperSharedPageTables.root[first:]
	copy(dst, src)
}

// PTEs is a collection of entries.
type PTEs [entriesPerPage]PTE
13 changes: 11 additions & 2 deletions pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,17 @@ func (p *PageTables) Init(allocator Allocator) {
p.Allocator = allocator
p.root = p.Allocator.NewPTEs()
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
p.archPageTables.root = p.Allocator.NewPTEs()
p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root)
}

// cloneUpperShared checks the upper shared configuration; no entries are
// copied here.
//
// It panics unless upperStart equals upperBottom.
//
//go:nosplit
func (p *PageTables) cloneUpperShared() {
	// nothing to do for arm beyond the sanity check.
	if p.upperStart == upperBottom {
		return
	}
	panic("upperStart should be the same as upperBottom")
}

// PTEs is a collection of entries.
Expand Down
2 changes: 1 addition & 1 deletion pkg/sentry/platform/ring0/pagetables/walker_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func next(start uintptr, size uintptr) uintptr {
func (w *Walker) iterateRangeCanonical(start, end uintptr) {
pgdEntryIndex := w.pageTables.root
if start >= upperBottom {
pgdEntryIndex = w.pageTables.archPageTables.root
pgdEntryIndex = w.pageTables.upperSharedPageTables.root
}

for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
Expand Down

0 comments on commit 3425485

Please sign in to comment.