Skip to content

Commit

Permalink
Merge pull request #306 from william-wang/master
Browse files Browse the repository at this point in the history
Fix the scheduler panic that occurs whenever a GPU is lost on a node
  • Loading branch information
volcano-sh-bot authored Jul 8, 2019
2 parents 5b98aff + 52d9732 commit 3f264d4
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 2 deletions.
20 changes: 18 additions & 2 deletions pkg/scheduler/api/node_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,18 @@ func (ni *NodeInfo) SetNode(node *v1.Node) {
}
}

// allocateIdleResource deducts the task's resource request from the node's
// idle resources.
//
// If the request is not LessEqual to ni.Idle, Idle is left untouched, the
// node's State is set to NotReady with reason "OutOfSync", and an error is
// returned so the caller can stop accounting for the task.
func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) error {
	// Guard: the request does not fit into what the node reports as idle.
	if !ti.Resreq.LessEqual(ni.Idle) {
		ni.State = NodeState{
			Phase:  NotReady,
			Reason: "OutOfSync",
		}
		return fmt.Errorf("Selected node NotReady")
	}

	ni.Idle.Sub(ti.Resreq)
	return nil
}

// AddTask is used to add a task in nodeInfo object
func (ni *NodeInfo) AddTask(task *TaskInfo) error {
key := PodKey(task.Pod)
Expand All @@ -176,12 +188,16 @@ func (ni *NodeInfo) AddTask(task *TaskInfo) error {
if ni.Node != nil {
switch ti.Status {
case Releasing:
if err := ni.allocateIdleResource(ti); err != nil {
return err
}
ni.Releasing.Add(ti.Resreq)
ni.Idle.Sub(ti.Resreq)
case Pipelined:
ni.Releasing.Sub(ti.Resreq)
default:
ni.Idle.Sub(ti.Resreq)
if err := ni.allocateIdleResource(ti); err != nil {
return err
}
}

ni.Used.Add(ti.Resreq)
Expand Down
19 changes: 19 additions & 0 deletions pkg/scheduler/api/node_info_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ func TestNodeInfo_AddPod(t *testing.T) {
case01Node := buildNode("n1", buildResourceList("8000m", "10G"))
case01Pod1 := buildPod("c1", "p1", "n1", v1.PodRunning, buildResourceList("1000m", "1G"), []metav1.OwnerReference{}, make(map[string]string))
case01Pod2 := buildPod("c1", "p2", "n1", v1.PodRunning, buildResourceList("2000m", "2G"), []metav1.OwnerReference{}, make(map[string]string))
// case2
case02Node := buildNode("n2", buildResourceList("2000m", "1G"))
case02Pod1 := buildPod("c2", "p1", "n2", v1.PodUnknown, buildResourceList("1000m", "2G"), []metav1.OwnerReference{}, make(map[string]string))

tests := []struct {
name string
Expand All @@ -63,6 +66,22 @@ func TestNodeInfo_AddPod(t *testing.T) {
},
},
},
{
name: "add 1 unknown pod",
node: case02Node,
pods: []*v1.Pod{case02Pod1},
expected: &NodeInfo{
Name: "n2",
Node: case02Node,
Idle: buildResource("2000m", "1G"),
Used: EmptyResource(),
Releasing: EmptyResource(),
Allocatable: buildResource("2000m", "1G"),
Capability: buildResource("2000m", "1G"),
State: NodeState{Phase: NotReady, Reason: "OutOfSync"},
Tasks: map[TaskID]*TaskInfo{},
},
},
}

for i, test := range tests {
Expand Down

0 comments on commit 3f264d4

Please sign in to comment.