Skip to content

Commit

Permalink
Updated logging and tracing
Browse files Browse the repository at this point in the history
Updating low level spans (right above bridge/guest connection) to use
tracing/log-level sampler. Set span kind to client/server, as
required, for bridge RPC calls.

Added traces and spans to public fuctions and wait goroutines in
contaienrd-shim-runhcs.

Sampler in cmd/contaienrd-shim-runhcs calls underneath main `service`
calls exposed to containerd use trace samples, and that is inherited by
calls beneath it.
Exceptions are made for internal functions that are called with
context.Background.

In places where spans did not seem necessary, added `Trace()` calls.
Additionally, if the context was propagated to other calls, or further
logging was done, added entry used for `Trace` to the context for
further propagation to callees.

Updated internal/cmd to use spans within `.Start()` and `.Wait()`

Reduced the number of `INFO` level logs printed, downgrading to
`DEBUG`.

Moved traces and spans from inside `Once.Do()`, so that they are
always logged, even if the `Once` does not execute.

Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>
  • Loading branch information
helsaawy committed Feb 16, 2022
1 parent 76917f9 commit 064cc68
Show file tree
Hide file tree
Showing 106 changed files with 1,515 additions and 579 deletions.
2 changes: 1 addition & 1 deletion cmd/containerd-shim-runhcs-v1/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func (e *eventPublisher) close() error {
}

func (e *eventPublisher) publishEvent(ctx context.Context, topic string, event interface{}) (err error) {
ctx, span := trace.StartSpan(ctx, "publishEvent")
ctx, span := oc.StartTraceSpan(ctx, "publishEvent")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(
Expand Down
11 changes: 10 additions & 1 deletion cmd/containerd-shim-runhcs-v1/exec_clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@ import (
"github.com/Microsoft/hcsshim/internal/cmd"
"github.com/Microsoft/hcsshim/internal/cow"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/logfields"
"github.com/Microsoft/hcsshim/internal/oc"
"github.com/Microsoft/hcsshim/internal/uvm"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"go.opencensus.io/trace"
)

func newClonedExec(
Expand All @@ -25,7 +28,7 @@ func newClonedExec(
"tid": tid,
"eid": id, // Init exec ID is always same as Task ID
"bundle": bundle,
}).Debug("newClonedExec")
}).Trace("newClonedExec")

he := &hcsExec{
events: events,
Expand Down Expand Up @@ -61,6 +64,12 @@ type clonedExec struct {
}

func (ce *clonedExec) Start(ctx context.Context) (err error) {
ctx, span := oc.StartTraceSpan(ctx, "clonedExec::Start")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute(logfields.TaskID, ce.tid),
trace.StringAttribute(logfields.ExecID, ce.id))

// A cloned exec should never initialize the container as it should
// already be running.
return ce.startInternal(ctx, false)
Expand Down
64 changes: 53 additions & 11 deletions cmd/containerd-shim-runhcs-v1/exec_hcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (

"github.com/Microsoft/hcsshim/internal/cmd"
"github.com/Microsoft/hcsshim/internal/cow"
"github.com/Microsoft/hcsshim/internal/logfields"
"github.com/Microsoft/hcsshim/internal/oc"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
"github.com/Microsoft/hcsshim/internal/signals"
Expand All @@ -43,7 +45,7 @@ func newHcsExec(
"eid": id, // Init exec ID is always same as Task ID
"bundle": bundle,
"wcow": isWCOW,
}).Debug("newHcsExec")
}).Trace("newHcsExec")

he := &hcsExec{
events: events,
Expand Down Expand Up @@ -249,12 +251,24 @@ func (he *hcsExec) startInternal(ctx context.Context, initializeContainer bool)
}

func (he *hcsExec) Start(ctx context.Context) (err error) {
ctx, span := oc.StartTraceSpan(ctx, "hcsExec::Start")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute(logfields.TaskID, he.tid),
trace.StringAttribute(logfields.ExecID, he.id))

// If he.id == he.tid then this is the init exec.
// We need to initialize the container itself before starting this exec.
return he.startInternal(ctx, he.id == he.tid)
}

func (he *hcsExec) Kill(ctx context.Context, signal uint32) error {
func (he *hcsExec) Kill(ctx context.Context, signal uint32) (err error) {
ctx, span := oc.StartTraceSpan(ctx, "hcsExec::Kill")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute(logfields.TaskID, he.tid),
trace.StringAttribute(logfields.ExecID, he.id))

he.sl.Lock()
defer he.sl.Unlock()
switch he.state {
Expand All @@ -267,7 +281,6 @@ func (he *hcsExec) Kill(ctx context.Context, signal uint32) error {
supported = he.host == nil || he.host.SignalProcessSupported()
}
var options interface{}
var err error
if he.isWCOW {
var opt *guestresource.SignalProcessOptionsWCOW
opt, err = signals.ValidateWCOW(int(signal), supported)
Expand Down Expand Up @@ -307,6 +320,14 @@ func (he *hcsExec) Kill(ctx context.Context, signal uint32) error {
}

func (he *hcsExec) ResizePty(ctx context.Context, width, height uint32) error {
ctx, etr := log.S(ctx, logrus.Fields{
logfields.TaskID: he.tid,
logfields.ExecID: he.id,
"width": width,
"height": height,
})
etr.Trace("hcsExec::ResizePty")

he.sl.Lock()
defer he.sl.Unlock()
if !he.io.Terminal() {
Expand All @@ -320,6 +341,11 @@ func (he *hcsExec) ResizePty(ctx context.Context, width, height uint32) error {
}

func (he *hcsExec) CloseIO(ctx context.Context, stdin bool) error {
ctx, etr := log.S(ctx, logrus.Fields{
logfields.TaskID: he.tid,
logfields.ExecID: he.id,
})
etr.Trace("hcsExec::CloseIO")
// If we have any upstream IO we close the upstream connection. This will
// unblock the `io.Copy` in the `Start()` call which will signal
// `he.p.CloseStdin()`. If `he.io.Stdin()` is already closed this is safe to
Expand All @@ -334,6 +360,13 @@ func (he *hcsExec) Wait() *task.StateResponse {
}

func (he *hcsExec) ForceExit(ctx context.Context, status int) {
ctx, etr := log.S(ctx, logrus.Fields{
logfields.TaskID: he.tid,
logfields.ExecID: he.id,
"status": status,
})
etr.Trace("hcsExec::ForceExit")

he.sl.Lock()
defer he.sl.Unlock()
if he.state != shimExecStateExited {
Expand Down Expand Up @@ -369,10 +402,15 @@ func (he *hcsExec) ForceExit(ctx context.Context, status int) {
// We DO NOT send the async `TaskExit` event because we never would have sent
// the `TaskStart`/`TaskExecStarted` event.
func (he *hcsExec) exitFromCreatedL(ctx context.Context, status int) {
if he.state != shimExecStateExited {
// Avoid logging the force if we already exited gracefully
log.G(ctx).WithField("status", status).Debug("hcsExec::exitFromCreatedL")
ctx, etr := log.S(ctx, logrus.Fields{
logfields.TaskID: he.tid,
logfields.ExecID: he.id,
"status": status,
"state": he.state,
})
etr.Trace("hcsExec::exitFromCreatedL")

if he.state != shimExecStateExited {
// Unblock the container exit goroutine
he.processDoneOnce.Do(func() { close(he.processDone) })
// Transition this exec
Expand Down Expand Up @@ -414,13 +452,15 @@ func (he *hcsExec) exitFromCreatedL(ctx context.Context, status int) {
//
// 7. Finally, save the UVM and this container as a template if specified.
func (he *hcsExec) waitForExit() {
ctx, span := trace.StartSpan(context.Background(), "hcsExec::waitForExit")
ctx, span := oc.StartTraceSpan(context.Background(), "hcsExec::waitForExit")
var err error // this will only save the last error, since we dont return early on error
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(
trace.StringAttribute("tid", he.tid),
trace.StringAttribute("eid", he.id))

err := he.p.Process.Wait()
err = he.p.Process.Wait()
if err != nil {
log.G(ctx).WithError(err).Error("failed process Wait")
}
Expand All @@ -431,9 +471,11 @@ func (he *hcsExec) waitForExit() {

code, err := he.p.Process.ExitCode()
if err != nil {
log.G(ctx).WithError(err).Error("failed to get ExitCode")
log.G(ctx).WithError(err).WithField(logfields.ExecID, he.id).Error("failed to get hcsExec ExitCode")
} else {
log.G(ctx).WithField("exitCode", code).Debug("exited")
log.G(ctx).WithFields(logrus.Fields{
"exitCode": code,
logfields.ExecID: he.id}).Debug("hcsExec exited")
}

he.sl.Lock()
Expand Down Expand Up @@ -476,7 +518,7 @@ func (he *hcsExec) waitForExit() {
//
// This MUST be called via a goroutine at exec create.
func (he *hcsExec) waitForContainerExit() {
ctx, span := trace.StartSpan(context.Background(), "hcsExec::waitForContainerExit")
ctx, span := oc.StartTraceSpan(context.Background(), "hcsExec::waitForContainerExit")
defer span.End()
span.AddAttributes(
trace.StringAttribute("tid", he.tid),
Expand Down
29 changes: 27 additions & 2 deletions cmd/containerd-shim-runhcs-v1/exec_wcow_podsandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"time"

"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/logfields"
eventstypes "github.com/containerd/containerd/api/events"
containerd_v1_types "github.com/containerd/containerd/api/types/task"
"github.com/containerd/containerd/errdefs"
Expand All @@ -20,7 +21,7 @@ func newWcowPodSandboxExec(ctx context.Context, events publisher, tid, bundle st
"tid": tid,
"eid": tid, // Init exec ID is always same as Task ID
"bundle": bundle,
}).Debug("newWcowPodSandboxExec")
}).Trace("newWcowPodSandboxExec")

wpse := &wcowPodSandboxExec{
events: events,
Expand Down Expand Up @@ -121,6 +122,11 @@ func (wpse *wcowPodSandboxExec) Status() *task.StateResponse {
}

func (wpse *wcowPodSandboxExec) Start(ctx context.Context) error {
log.G(ctx).WithFields(logrus.Fields{
logfields.TaskID: wpse.tid,
logfields.ExecID: wpse.tid,
}).Trace("wcowPodSandboxExec::Start")

wpse.sl.Lock()
defer wpse.sl.Unlock()
if wpse.state != shimExecStateCreated {
Expand All @@ -142,6 +148,12 @@ func (wpse *wcowPodSandboxExec) Start(ctx context.Context) error {
}

func (wpse *wcowPodSandboxExec) Kill(ctx context.Context, signal uint32) error {
log.G(ctx).WithFields(logrus.Fields{
logfields.TaskID: wpse.tid,
logfields.ExecID: wpse.tid,
"signal": signal,
}).Trace("wcowPodSandboxExec::Kill")

wpse.sl.Lock()
defer wpse.sl.Unlock()
switch wpse.state {
Expand Down Expand Up @@ -170,6 +182,12 @@ func (wpse *wcowPodSandboxExec) Kill(ctx context.Context, signal uint32) error {
}

func (wpse *wcowPodSandboxExec) ResizePty(ctx context.Context, width, height uint32) error {
// useless function, but trace could help track down who is calling it
log.G(ctx).WithFields(logrus.Fields{
logfields.TaskID: wpse.tid,
logfields.ExecID: wpse.tid,
}).Trace("wcowPodSandboxExec::ResizePty")

wpse.sl.Lock()
defer wpse.sl.Unlock()
// We will never have IO for a sandbox container so we wont have a tty
Expand All @@ -187,11 +205,18 @@ func (wpse *wcowPodSandboxExec) Wait() *task.StateResponse {
}

func (wpse *wcowPodSandboxExec) ForceExit(ctx context.Context, status int) {
ctx, etr := log.S(ctx, logrus.Fields{ //nolint:ineffassign,staticcheck
logfields.TaskID: wpse.tid,
logfields.ExecID: wpse.tid,
"status": status,
})
etr.Trace("wcowPodSandboxExec::ForceExit")

wpse.sl.Lock()
defer wpse.sl.Unlock()
if wpse.state != shimExecStateExited {
// Avoid logging the force if we already exited gracefully
log.G(ctx).WithField("status", status).Debug("wcowPodSandboxExec::ForceExit")
etr.WithField("status", status).Debug("forcing exit")

wpse.state = shimExecStateExited
wpse.exitStatus = 1
Expand Down
2 changes: 1 addition & 1 deletion cmd/containerd-shim-runhcs-v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func main() {
)

// Register our OpenCensus logrus exporter
trace.ApplyConfig(trace.Config{DefaultSampler: trace.AlwaysSample()})
trace.ApplyConfig(trace.Config{DefaultSampler: oc.DefaultSampler})
trace.RegisterExporter(&oc.LogrusExporter{})

app := cli.NewApp()
Expand Down
22 changes: 20 additions & 2 deletions cmd/containerd-shim-runhcs-v1/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import (
"sync"

"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/logfields"
"github.com/Microsoft/hcsshim/internal/oc"
"github.com/Microsoft/hcsshim/internal/oci"
"github.com/Microsoft/hcsshim/internal/uvm"
"github.com/Microsoft/hcsshim/osversion"
Expand All @@ -18,6 +20,7 @@ import (
"github.com/containerd/containerd/runtime/v2/task"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"go.opencensus.io/trace"
"golang.org/x/sync/errgroup"
)

Expand Down Expand Up @@ -62,7 +65,10 @@ type shimPod interface {
}

func createPod(ctx context.Context, events publisher, req *task.CreateTaskRequest, s *specs.Spec) (_ shimPod, err error) {
log.G(ctx).WithField("tid", req.ID).Debug("createPod")
ctx, span := oc.StartTraceSpan(ctx, "createPod")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute("tid", req.ID))

if osversion.Build() < osversion.RS5 {
return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "pod support is not available on Windows versions previous to RS5 (%d)", osversion.RS5)
Expand Down Expand Up @@ -286,6 +292,8 @@ func (p *pod) ID() string {
}

func (p *pod) GetCloneAnnotations(ctx context.Context, s *specs.Spec) (bool, string, error) {
log.G(ctx).WithField(logfields.TaskID, p.id).Trace("pod::GetCloneAnnotations")

isTemplate, templateID, err := oci.ParseCloneAnnotations(ctx, s)
if err != nil {
return false, "", err
Expand All @@ -296,6 +304,11 @@ func (p *pod) GetCloneAnnotations(ctx context.Context, s *specs.Spec) (bool, str
}

func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) {
ctx, span := oc.StartTraceSpan(ctx, "pod::CreateTask")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute(logfields.TaskID, p.id))

if req.ID == p.id {
return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID)
}
Expand Down Expand Up @@ -371,7 +384,12 @@ func (p *pod) GetTask(tid string) (shimTask, error) {
return raw.(shimTask), nil
}

func (p *pod) KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error {
func (p *pod) KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) (err error) {
ctx, span := oc.StartTraceSpan(ctx, "pod::KillTask")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute(logfields.TaskID, p.id))

t, err := p.GetTask(tid)
if err != nil {
return err
Expand Down
Loading

0 comments on commit 064cc68

Please sign in to comment.