-
Notifications
You must be signed in to change notification settings - Fork 256
/
pod.go
445 lines (407 loc) · 13.6 KB
/
pod.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
//go:build windows
package main
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"github.com/Microsoft/hcsshim/internal/layers"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/oci"
"github.com/Microsoft/hcsshim/internal/uvm"
"github.com/Microsoft/hcsshim/osversion"
"github.com/Microsoft/hcsshim/pkg/annotations"
eventstypes "github.com/containerd/containerd/api/events"
task "github.com/containerd/containerd/api/runtime/task/v2"
"github.com/containerd/containerd/runtime"
"github.com/containerd/errdefs"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"golang.org/x/sync/errgroup"
)
// shimPod represents the logical grouping of all tasks in a single set of
// shared namespaces. The pod sandbox (container) is represented by the task
// that matches the `shimPod.ID()`
type shimPod interface {
// ID is the id of the task representing the pause (sandbox) container.
ID() string
// CreateTask creates a workload task within this pod named `tid` with
// settings `s`.
//
// If `tid==ID()` or `tid` is the same as any other task in this pod, this
// pod MUST return `errdefs.ErrAlreadyExists`.
CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (shimTask, error)
// GetTask returns a task in this pod that matches `tid`.
//
// If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`.
GetTask(tid string) (shimTask, error)
// GetTasks returns every task in the pod.
//
// If a shim cannot be loaded, this will return an error.
ListTasks() ([]shimTask, error)
// KillTask sends `signal` to task that matches `tid`.
//
// If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`.
//
// If `tid==ID() && eid == "" && all == true` this pod will send `signal` to
// all tasks in the pod and lastly send `signal` to the sandbox itself.
//
// If `all == true && eid != ""` this pod MUST return
// `errdefs.ErrFailedPrecondition`.
//
// A call to `KillTask` is only valid when the exec found by `tid,eid` is in
// the `shimExecStateRunning, shimExecStateExited` states. If the exec is
// not in this state this pod MUST return `errdefs.ErrFailedPrecondition`.
KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error
// DeleteTask removes a task from being tracked by this pod, and cleans up
// the resources the shim allocated for the task.
//
// The task's init exec (eid == "") must be either `shimExecStateCreated` or
// `shimExecStateExited`. If the exec is not in this state this pod MUST
// return `errdefs.ErrFailedPrecondition`. Deleting the pod's sandbox task
// is a no-op.
DeleteTask(ctx context.Context, tid string) error
}
func createPod(ctx context.Context, events publisher, req *task.CreateTaskRequest, s *specs.Spec) (_ shimPod, err error) {
log.G(ctx).WithField("tid", req.ID).Debug("createPod")
if osversion.Build() < osversion.RS5 {
return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "pod support is not available on Windows versions previous to RS5 (%d)", osversion.RS5)
}
ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations)
if err != nil {
return nil, err
}
if ct != oci.KubernetesContainerTypeSandbox {
return nil, errors.Wrapf(
errdefs.ErrFailedPrecondition,
"expected annotation: '%s': '%s' got '%s'",
annotations.KubernetesContainerType,
oci.KubernetesContainerTypeSandbox,
ct)
}
if sid != req.ID {
return nil, errors.Wrapf(
errdefs.ErrFailedPrecondition,
"expected annotation '%s': '%s' got '%s'",
annotations.KubernetesSandboxID,
req.ID,
sid)
}
owner := filepath.Base(os.Args[0])
isWCOW := oci.IsWCOW(s)
p := pod{
events: events,
id: req.ID,
spec: s,
}
var parent *uvm.UtilityVM
var lopts *uvm.OptionsLCOW
if oci.IsIsolated(s) {
// Create the UVM parent
opts, err := oci.SpecToUVMCreateOpts(ctx, s, fmt.Sprintf("%s@vm", req.ID), owner)
if err != nil {
return nil, err
}
switch opts.(type) {
case *uvm.OptionsLCOW:
lopts = (opts).(*uvm.OptionsLCOW)
lopts.BundleDirectory = req.Bundle
parent, err = uvm.CreateLCOW(ctx, lopts)
if err != nil {
return nil, err
}
case *uvm.OptionsWCOW:
var layerFolders []string
if s.Windows != nil {
layerFolders = s.Windows.LayerFolders
}
wopts := (opts).(*uvm.OptionsWCOW)
wopts.BootFiles, err = layers.GetWCOWUVMBootFilesFromLayers(ctx, req.Rootfs, layerFolders)
if err != nil {
return nil, err
}
parent, err = uvm.CreateWCOW(ctx, wopts)
if err != nil {
return nil, err
}
}
err = parent.Start(ctx)
if err != nil {
parent.Close()
return nil, err
}
} else if oci.IsJobContainer(s) {
// If we're making a job container fake a task (i.e reuse the wcowPodSandbox logic)
p.sandboxTask = newWcowPodSandboxTask(ctx, events, req.ID, req.Bundle, parent, "")
if err := events.publishEvent(
ctx,
runtime.TaskCreateEventTopic,
&eventstypes.TaskCreate{
ContainerID: req.ID,
Bundle: req.Bundle,
Rootfs: req.Rootfs,
IO: &eventstypes.TaskIO{
Stdin: req.Stdin,
Stdout: req.Stdout,
Stderr: req.Stderr,
Terminal: req.Terminal,
},
Checkpoint: "",
Pid: 0,
}); err != nil {
return nil, err
}
p.jobContainer = true
return &p, nil
} else if !isWCOW {
return nil, errors.Wrap(errdefs.ErrFailedPrecondition, "oci spec does not contain WCOW or LCOW spec")
}
defer func() {
// clean up the uvm if we fail any further operations
if err != nil && parent != nil {
parent.Close()
}
}()
p.host = parent
if parent != nil {
cid := req.ID
if id, ok := s.Annotations[annotations.NcproxyContainerID]; ok {
cid = id
}
caAddr := fmt.Sprintf(uvm.ComputeAgentAddrFmt, cid)
if err := parent.CreateAndAssignNetworkSetup(ctx, caAddr, cid); err != nil {
return nil, err
}
}
// TODO: JTERRY75 - There is a bug in the compartment activation for Windows
// Process isolated that requires us to create the real pause container to
// hold the network compartment open. This is not required for Windows
// Hypervisor isolated. When we have a build that supports this for Windows
// Process isolated make sure to move back to this model.
// For WCOW we fake out the init task since we dont need it. We only
// need to provision the guest network namespace if this is hypervisor
// isolated. Process isolated WCOW gets the namespace endpoints
// automatically.
nsid := ""
if isWCOW && parent != nil {
if s.Windows != nil && s.Windows.Network != nil {
nsid = s.Windows.Network.NetworkNamespace
}
if nsid != "" {
if err := parent.ConfigureNetworking(ctx, nsid); err != nil {
return nil, errors.Wrapf(err, "failed to setup networking for pod %q", req.ID)
}
}
p.sandboxTask = newWcowPodSandboxTask(ctx, events, req.ID, req.Bundle, parent, nsid)
// Publish the created event. We only do this for a fake WCOW task. A
// HCS Task will event itself based on actual process lifetime.
if err := events.publishEvent(
ctx,
runtime.TaskCreateEventTopic,
&eventstypes.TaskCreate{
ContainerID: req.ID,
Bundle: req.Bundle,
Rootfs: req.Rootfs,
IO: &eventstypes.TaskIO{
Stdin: req.Stdin,
Stdout: req.Stdout,
Stderr: req.Stderr,
Terminal: req.Terminal,
},
Checkpoint: "",
Pid: 0,
}); err != nil {
return nil, err
}
} else {
if isWCOW {
defaultArgs := "c:\\windows\\system32\\cmd.exe"
// For the default pause image, the entrypoint
// used is pause.exe
// If the default pause image is not used for pause containers,
// the activation will immediately exit on Windows
// because there is no command. We forcibly update the command here
// to keep it alive only for non-default pause images.
// TODO: This override can be completely removed from containerd/1.7
if (len(s.Process.Args) == 1 && strings.EqualFold(s.Process.Args[0], defaultArgs)) ||
strings.EqualFold(s.Process.CommandLine, defaultArgs) {
log.G(ctx).Warning("Detected CMD override for pause container entrypoint." +
"Please consider switching to a pause image with an explicit cmd set")
s.Process.CommandLine = "cmd /c ping -t 127.0.0.1 > nul"
}
}
// LCOW (and WCOW Process Isolated for the time being) requires a real
// task for the sandbox.
lt, err := newHcsTask(ctx, events, parent, true, req, s)
if err != nil {
return nil, err
}
p.sandboxTask = lt
}
return &p, nil
}
var _ = (shimPod)(&pod{})
type pod struct {
events publisher
// id is the id of the sandbox task when the pod is created.
//
// It MUST be treated as read only in the lifetime of the pod.
id string
// sandboxTask is the task that represents the sandbox.
//
// Note: The invariant `id==sandboxTask.ID()` MUST be true.
//
// It MUST be treated as read only in the lifetime of the pod.
sandboxTask shimTask
// host is the UtilityVM that is hosting `sandboxTask` if the task is
// hypervisor isolated.
//
// It MUST be treated as read only in the lifetime of the pod.
host *uvm.UtilityVM
// jobContainer specifies whether this pod is for WCOW job containers only.
//
// It MUST be treated as read only in the lifetime of the pod.
jobContainer bool
// spec is the OCI runtime specification for the pod sandbox container.
spec *specs.Spec
workloadTasks sync.Map
}
func (p *pod) ID() string {
return p.id
}
func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) {
if req.ID == p.id {
return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID)
}
e, _ := p.sandboxTask.GetExec("")
if e.State() != shimExecStateRunning {
return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "task with id: '%s' cannot be created in pod: '%s' which is not running", req.ID, p.id)
}
_, ok := p.workloadTasks.Load(req.ID)
if ok {
return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists id pod: '%s'", req.ID, p.id)
}
if p.jobContainer {
// This is a short circuit to make sure that all containers in a pod will have
// the same IP address/be added to the same compartment.
//
// There will need to be OS work needed to support this scenario, so for now we need to block on
// this.
if !oci.IsJobContainer(s) {
return nil, errors.New("cannot create a normal process isolated container if the pod sandbox is a job container")
}
// Pass through some annotations from the pod spec that if specified will need to be made available
// to every container as well. Kubernetes only passes annotations to RunPodSandbox so there needs to be
// a way for individual containers to get access to these.
oci.SandboxAnnotationsPassThrough(
p.spec.Annotations,
s.Annotations,
annotations.HostProcessInheritUser,
annotations.HostProcessRootfsLocation,
)
}
ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations)
if err != nil {
return nil, err
}
if ct != oci.KubernetesContainerTypeContainer {
return nil, errors.Wrapf(
errdefs.ErrFailedPrecondition,
"expected annotation: '%s': '%s' got '%s'",
annotations.KubernetesContainerType,
oci.KubernetesContainerTypeContainer,
ct)
}
if sid != p.id {
return nil, errors.Wrapf(
errdefs.ErrFailedPrecondition,
"expected annotation '%s': '%s' got '%s'",
annotations.KubernetesSandboxID,
p.id,
sid)
}
st, err := newHcsTask(ctx, p.events, p.host, false, req, s)
if err != nil {
return nil, err
}
p.workloadTasks.Store(req.ID, st)
return st, nil
}
func (p *pod) GetTask(tid string) (shimTask, error) {
if tid == p.id {
return p.sandboxTask, nil
}
raw, loaded := p.workloadTasks.Load(tid)
if !loaded {
return nil, errors.Wrapf(errdefs.ErrNotFound, "task with id: '%s' not found", tid)
}
return raw.(shimTask), nil
}
func (p *pod) ListTasks() (_ []shimTask, err error) {
tasks := []shimTask{p.sandboxTask}
p.workloadTasks.Range(func(key, value interface{}) bool {
wt, loaded := value.(shimTask)
if !loaded {
err = fmt.Errorf("failed to load tasks %s", key)
return false
}
tasks = append(tasks, wt)
// Iterate all. Returning false stops the iteration. See:
// https://pkg.go.dev/sync#Map.Range
return true
})
if err != nil {
return nil, err
}
return tasks, nil
}
func (p *pod) KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error {
t, err := p.GetTask(tid)
if err != nil {
return err
}
if all && eid != "" {
return errors.Wrapf(errdefs.ErrFailedPrecondition, "cannot signal all with non empty ExecID: '%s'", eid)
}
eg := errgroup.Group{}
if all && tid == p.id {
// We are in a kill all on the sandbox task. Signal everything.
p.workloadTasks.Range(func(key, value interface{}) bool {
wt := value.(shimTask)
eg.Go(func() error {
return wt.KillExec(ctx, eid, signal, all)
})
// Iterate all. Returning false stops the iteration. See:
// https://pkg.go.dev/sync#Map.Range
return true
})
}
eg.Go(func() error {
return t.KillExec(ctx, eid, signal, all)
})
return eg.Wait()
}
func (p *pod) DeleteTask(ctx context.Context, tid string) error {
// Deleting the sandbox task is a no-op, since the service should delete its
// reference to the sandbox task or pod, and `p.sandboxTask != nil` is an
// invariant that is relied on elsewhere.
// However, still get the init exec for all tasks to ensure that they have
// been properly stopped.
t, err := p.GetTask(tid)
if err != nil {
return errors.Wrap(err, "could not find task to delete")
}
e, err := t.GetExec("")
if err != nil {
return errors.Wrap(err, "could not get initial exec")
}
if e.State() == shimExecStateRunning {
return errors.Wrap(errdefs.ErrFailedPrecondition, "cannot delete task with running exec")
}
if p.id != tid {
p.workloadTasks.Delete(tid)
}
return nil
}