This repository was archived by the owner on Feb 8, 2021. It is now read-only.

fix the race in case sandbox start failed during pod creating #600

Merged: 4 commits, Apr 12, 2017
12 changes: 10 additions & 2 deletions daemon/pod/container.go
@@ -20,6 +20,7 @@ import (
"github.com/docker/engine-api/types/strslice"

"github.com/hyperhq/hypercontainer-utils/hlog"
"github.com/hyperhq/hyperd/errors"
apitypes "github.com/hyperhq/hyperd/types"
"github.com/hyperhq/hyperd/utils"
runv "github.com/hyperhq/runv/api"
@@ -233,6 +234,11 @@ func (c *Container) Add() error {

func (c *Container) start() error {
if err := c.status.Start(); err != nil {
if err == errors.ErrContainerAlreadyRunning {
err = nil
c.Log(INFO, "container in running state, do not need start")
return nil
}
c.Log(ERROR, err)
return err
}
@@ -1196,8 +1202,10 @@ func (cs *ContainerStatus) Start() error {
cs.Lock()
defer cs.Unlock()

if cs.State != S_CONTAINER_CREATED {
return fmt.Errorf("only CREATING container could be set to creatd, current: %d", cs.State)
if cs.State == S_CONTAINER_RUNNING {
return errors.ErrContainerAlreadyRunning
} else if cs.State != S_CONTAINER_CREATED {
return fmt.Errorf("only CREATED container could be set to running, current: %d", cs.State)
}

cs.Killed = false
6 changes: 3 additions & 3 deletions daemon/pod/decommission.go
@@ -536,14 +536,14 @@ func (p *XPod) cleanup() {
p.resourceLock.Lock()
defer p.resourceLock.Unlock()

p.statusLock.RLock()
p.statusLock.Lock()
if p.status == S_POD_STOPPED || p.status == S_POD_NONE {
p.statusLock.RUnlock()
p.statusLock.Unlock()
return
} else {
p.status = S_POD_STOPPING
}
p.statusLock.RUnlock()
p.statusLock.Unlock()

err := p.decommissionResources()
if err != nil {
3 changes: 0 additions & 3 deletions daemon/pod/migration.go
@@ -198,9 +198,6 @@ func persistLagecyPod(factory *PodFactory, spec *apitypes.UserPod) error {
if err = p.initResources(spec, false); err != nil {
return err
}
if err = p.prepareResources(); err != nil {
return err
}

if err = p.savePod(); err != nil {
return err
21 changes: 15 additions & 6 deletions daemon/pod/pod.go
@@ -74,6 +74,7 @@ type XPod struct {
// got a value from this chan, it should put an element to it again, in case other procedure may wait
// on it too.
stoppedChan chan bool
initCond *sync.Cond
}

// The Log infrastructure, to add pod name as prefix of the log message.
@@ -110,29 +111,37 @@ func (p *XPod) SandboxName() string {
}

func (p *XPod) IsNone() bool {
p.statusLock.Lock()
p.statusLock.RLock()
isNone := p.status == S_POD_NONE
p.statusLock.Unlock()
p.statusLock.RUnlock()

return isNone
}

func (p *XPod) IsRunning() bool {
p.statusLock.Lock()
p.statusLock.RLock()
running := p.status == S_POD_RUNNING
p.statusLock.Unlock()
p.statusLock.RUnlock()

return running
}

func (p *XPod) IsAlive() bool {
p.statusLock.Lock()
p.statusLock.RLock()
alive := (p.status == S_POD_RUNNING) || (p.status == S_POD_STARTING)
p.statusLock.Unlock()
p.statusLock.RUnlock()

return alive
}

func (p *XPod) IsStopped() bool {
p.statusLock.RLock()
stopped := p.status == S_POD_STOPPED
p.statusLock.RUnlock()

return stopped
}

func (p *XPod) IsContainerRunning(cid string) bool {
if !p.IsRunning() {
return false
67 changes: 54 additions & 13 deletions daemon/pod/provision.go
@@ -7,6 +7,7 @@ import (
"time"

"github.com/hyperhq/hypercontainer-utils/hlog"
"github.com/hyperhq/hyperd/errors"
apitypes "github.com/hyperhq/hyperd/types"
"github.com/hyperhq/hyperd/utils"
runv "github.com/hyperhq/runv/api"
@@ -84,7 +85,7 @@ func newXPod(factory *PodFactory, spec *apitypes.UserPod) (*XPod, error) {
}
factory.hosts = HostsCreator(spec.Id)
factory.logCreator = initLogCreator(factory, spec)
return &XPod{
p := &XPod{
name: spec.Id,
logPrefix: fmt.Sprintf("Pod[%s] ", spec.Id),
globalSpec: spec.CloneGlobalPart(),
@@ -98,7 +99,9 @@ func newXPod(factory *PodFactory, spec *apitypes.UserPod) (*XPod, error) {
statusLock: &sync.RWMutex{},
stoppedChan: make(chan bool, 1),
factory: factory,
}, nil
}
p.initCond = sync.NewCond(p.statusLock.RLocker())
return p, nil
}

func (p *XPod) ContainerCreate(c *apitypes.UserContainer) (string, error) {
@@ -211,7 +214,7 @@ func (p *XPod) ContainerStart(cid string) error {
// Start() means start a STOPPED pod.
func (p *XPod) Start() error {

if p.status == S_POD_STOPPED {
if p.IsStopped() {
Contributor:

I remember there is an S_POD_ERROR state; should we filter it here?

Member Author:

Let's leave this status to a later issue/PR.

In principle, if a pod fails and can be restarted, it should be set back to the STOPPED status. Only pods that hit a fatal exception and cannot be started any more should stay in the ERROR status.

However, sometimes the daemon cannot judge the situation accurately, and I think it might be better if we allowed a user to set the status back from ERROR to STOPPED manually.

Contributor:

Yep, more complicated than I thought.
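As a purely illustrative aside on that point, a manual reset from ERROR back to STOPPED could look roughly like the sketch below. The ResetError name and its exact checks are assumptions for illustration; nothing of the kind is part of this PR.

// Hypothetical sketch, not part of this PR: a user-triggered reset that
// moves a pod from S_POD_ERROR back to S_POD_STOPPED so it can be started
// again once the operator judges the failure to be recoverable.
func (p *XPod) ResetError() error {
	p.statusLock.Lock()
	defer p.statusLock.Unlock()

	if p.status != S_POD_ERROR {
		return fmt.Errorf("pod %s is not in the ERROR state, current: %d", p.Id(), p.status)
	}
	p.status = S_POD_STOPPED
	p.Log(INFO, "pod status manually reset from ERROR to STOPPED")
	return nil
}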

if err := p.createSandbox(p.globalSpec); err != nil {
p.Log(ERROR, "failed to create sandbox for the stopped pod: %v", err)
return err
Expand All @@ -226,13 +229,12 @@ func (p *XPod) Start() error {
}
}

if p.status == S_POD_RUNNING {
if err := p.startAll(); err != nil {
return err
}
} else {
err := fmt.Errorf("not in proper status and could not be started: %v", p.status)
p.Log(ERROR, err)
err := p.waitPodRun("start pod")
if err != nil {
p.Log(ERROR, "wait running failed, cannot start pod")
return err
}
if err := p.startAll(); err != nil {
return err
}

@@ -299,20 +301,20 @@ func (p *XPod) waitVMInit() {
}
r := p.sandbox.WaitInit()
p.Log(INFO, "sandbox init result: %#v", r)
p.statusLock.Lock()
if r.IsSuccess() {
p.statusLock.Lock()
if p.status == S_POD_STARTING {
p.status = S_POD_RUNNING
}
p.statusLock.Unlock()
} else {
p.statusLock.Lock()
if p.sandbox != nil {
go p.sandbox.Shutdown()
}
p.status = S_POD_STOPPING
p.statusLock.Unlock()
}
p.initCond.Broadcast()
p.statusLock.Unlock()
}

func (p *XPod) reserveNames(containers []*apitypes.UserContainer) error {
Expand Down Expand Up @@ -391,6 +393,15 @@ func (p *XPod) prepareResources() error {
var (
err error
)

p.resourceLock.Lock()
defer p.resourceLock.Unlock()

if !p.IsAlive() {
p.Log(ERROR, "pod is not alive, can not prepare resources")
return errors.ErrPodNotAlive.WithArgs(p.Id())
}

//generate /etc/hosts
p.factory.hosts.Do()

@@ -414,6 +425,14 @@
// addResourcesToSandbox() add resources to sandbox parallelly, it issues
// runV API parallelly to send the NIC, Vols, and Containers to sandbox
func (p *XPod) addResourcesToSandbox() error {
p.resourceLock.Lock()
defer p.resourceLock.Unlock()

if !p.IsAlive() {
p.Log(ERROR, "pod is not alive, can not add resources to sandbox")
return errors.ErrPodNotAlive.WithArgs(p.Id())
}

p.Log(INFO, "adding resource to sandbox")
future := utils.NewFutureSet()

@@ -471,3 +490,25 @@ func (p *XPod) sandboxShareDir() string {
}
return filepath.Join(hypervisor.BaseDir, p.sandbox.Id, hypervisor.ShareDirTag)
}

func (p *XPod) waitPodRun(activity string) error {
p.statusLock.RLock()
for {
if p.status == S_POD_RUNNING || p.status == S_POD_PAUSED {
p.statusLock.RUnlock()
p.Log(DEBUG, "pod is running, proceed %s", activity)
return nil
}
if p.status != S_POD_STARTING {
p.statusLock.RUnlock()
// only starting could transit to running, if not starting, that's mean failed
p.Log(ERROR, "pod is not running, cannot %s", activity)
return errors.ErrPodNotRunning.WithArgs(p.Id())
}
p.Log(TRACE, "wait for pod running")
p.initCond.Wait()
}
// should never reach here
p.statusLock.RUnlock()
return errors.ErrorCodeCommon.WithArgs("reach unreachable code...")
}
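The new waitPodRun/waitVMInit pair is a standard condition-variable handoff: waiters re-check the status in a loop while holding the read lock that backs p.initCond, and waitVMInit broadcasts after updating the status under the write lock. Below is a self-contained sketch of that pattern, with illustrative names rather than hyperd's.

package main

import (
	"fmt"
	"sync"
	"time"
)

type podState int

const (
	podStarting podState = iota
	podRunning
	podStopped
)

type pod struct {
	mu    sync.RWMutex
	cond  *sync.Cond // built on mu.RLocker(), analogous to p.initCond
	state podState
}

func newPod() *pod {
	p := &pod{state: podStarting}
	p.cond = sync.NewCond(p.mu.RLocker())
	return p
}

// waitRunning blocks until the pod leaves STARTING, then reports whether it
// actually reached RUNNING (mirrors the loop in waitPodRun).
func (p *pod) waitRunning() bool {
	p.mu.RLock()
	defer p.mu.RUnlock()
	for p.state == podStarting {
		p.cond.Wait() // drops the read lock while sleeping, reacquires on wake-up
	}
	return p.state == podRunning
}

// setState updates the state under the write lock and wakes every waiter
// (mirrors the Broadcast at the end of waitVMInit).
func (p *pod) setState(s podState) {
	p.mu.Lock()
	p.state = s
	p.cond.Broadcast()
	p.mu.Unlock()
}

func main() {
	p := newPod()
	go func() {
		time.Sleep(100 * time.Millisecond)
		p.setState(podRunning) // or podStopped if the sandbox start failed
	}()
	fmt.Println("pod running:", p.waitRunning())
}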
35 changes: 35 additions & 0 deletions errors/errors.go
@@ -0,0 +1,35 @@
package errors

import (
"net/http"

"github.com/docker/distribution/registry/api/errcode"
)

const errGroup = "hyperd"

var (
ErrorCodeCommon = errcode.Register(errGroup, errcode.ErrorDescriptor{
Value: "COMMONERROR",
Message: "%v",
HTTPStatusCode: http.StatusInternalServerError,
})

ErrPodNotAlive = errcode.Register(errGroup, errcode.ErrorDescriptor{
Value: "HYPER_POD_NOT_ALIVE",
Message: "cannot complete the operation, because the pod %s is not alive",
HTTPStatusCode: http.StatusPreconditionFailed,
})

ErrPodNotRunning = errcode.Register(errGroup, errcode.ErrorDescriptor{
Value: "HYPER_POD_NOT_RUNNING",
Message: "cannot complete the operation, because the pod %s is not running",
HTTPStatusCode: http.StatusPreconditionFailed,
})

ErrContainerAlreadyRunning = errcode.Register(errGroup, errcode.ErrorDescriptor{
Value: "HYPER_CONTAINER_RUNNING",
Message: "container %s is in running state",
HTTPStatusCode: http.StatusPreconditionFailed,
})
)