Skip to content

Commit

Permalink
WIP: add otel logging to help with debugging:
Browse files Browse the repository at this point in the history
And to help with understanding code paths
taken. This is very helpful as we have to create
an idempotent Reconcile function.

Signed-off-by: Jacob Weinstock <jakobweinstock@gmail.com>
  • Loading branch information
jacobweinstock committed Oct 14, 2024
1 parent f2ba776 commit 30ea6ab
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 22 deletions.
2 changes: 2 additions & 0 deletions api/v1alpha1/workflow_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ type WorkflowStatus struct {
// +patchStrategy=merge
// +listType=atomic
Conditions []WorkflowCondition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"`

TraceParent string `json:"traceParent,omitempty"`
}

// JobStatus holds the state of a specific job.bmc.tinkerbell.org object created.
Expand Down
52 changes: 45 additions & 7 deletions cmd/tink-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import (
"github.com/spf13/pflag"
"github.com/spf13/viper"
"github.com/tinkerbell/tink/internal/deprecated/controller"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp"
"go.opentelemetry.io/otel/sdk/log"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap"
"k8s.io/client-go/tools/clientcmd"
clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
Expand Down Expand Up @@ -56,19 +60,53 @@ func NewRootCommand() *cobra.Command {
if err != nil {
panic(err)
}
logger := zapr.NewLogger(zlog).WithName("github.com/tinkerbell/tink")
logger2 := zapr.NewLogger(zlog).WithName("github.com/tinkerbell/tink")

cmd := &cobra.Command{
Use: "tink-controller",
PreRunE: func(cmd *cobra.Command, _ []string) error {
viper, err := createViper(logger)
viper, err := createViper(logger2)
if err != nil {
return fmt.Errorf("config init: %w", err)
}
return applyViper(viper, cmd)
},
RunE: func(cmd *cobra.Command, _ []string) error {
logger.Info("Starting controller version " + version)
os.Setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://192.168.2.50:4318")
os.Setenv("OTEL_EXPORTER_OTLP_INSECURE", "true")
os.Setenv("OTEL_SERVICE_NAME", "tink-controller")
ctx := cmd.Context()

oCfg := OConfig{
Servicename: "tink-controller",
Endpoint: "192.168.2.50:4317",
Insecure: true,
}
ctx, _, _ = Init(ctx, oCfg)
// Create the OTLP log exporter that sends logs to configured destination
logExporter, err := otlploghttp.New(ctx)
if err != nil {
panic("failed to initialize exporter")
}

// Create the logger provider
lp := log.NewLoggerProvider(
log.WithProcessor(
log.NewBatchProcessor(logExporter),
),
)

// Ensure the logger is shutdown before exiting so all pending logs are exported
defer lp.Shutdown(ctx)
handler := NewHandler("github.com/tinkerbell/tink", WithLoggerProvider(lp))

logger := logr.FromSlogHandler(handler)
tracer := otel.Tracer("my-tracer")
var span trace.Span
ctx, span = tracer.Start(ctx, "start up")
defer span.End()
// new stuff above
logger.Info("Starting controller version "+version, "TraceID", trace.SpanContextFromContext(ctx).TraceID())

ccfg := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
&clientcmd.ClientConfigLoadingRules{ExplicitPath: config.Kubeconfig},
Expand All @@ -85,7 +123,7 @@ func NewRootCommand() *cobra.Command {
}

options := ctrl.Options{
Logger: logger,
Logger: logger2,
LeaderElection: config.EnableLeaderElection,
LeaderElectionID: "tink.tinkerbell.org",
LeaderElectionNamespace: namespace,
Expand All @@ -95,14 +133,14 @@ func NewRootCommand() *cobra.Command {
HealthProbeBindAddress: config.ProbeAddr,
}

ctrl.SetLogger(logger)
ctrl.SetLogger(logger2)

mgr, err := controller.NewManager(cfg, options)
mgr, err := controller.NewManager(cfg, options, logger)
if err != nil {
return fmt.Errorf("controller manager: %w", err)
}

return mgr.Start(cmd.Context())
return mgr.Start(ctx)
},
}
config.AddFlags(cmd.Flags())
Expand Down
2 changes: 1 addition & 1 deletion config/manager-rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: manager-role
name: tink-controller-manager-role
rules:
- apiGroups:
- bmc.tinkerbell.org
Expand Down
9 changes: 6 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ toolchain go1.22.2
require (
github.com/Masterminds/sprig/v3 v3.3.0
github.com/avast/retry-go v3.0.0+incompatible
github.com/cenkalti/backoff/v4 v4.3.0
github.com/distribution/reference v0.6.0
github.com/docker/docker v27.3.1+incompatible
github.com/equinix-labs/otel-init-go v0.0.9
Expand All @@ -28,6 +29,10 @@ require (
github.com/stretchr/testify v1.9.0
github.com/tinkerbell/rufio v0.3.3
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.55.0
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.6.0
go.opentelemetry.io/otel/log v0.6.0
go.opentelemetry.io/otel/sdk/log v0.6.0
go.opentelemetry.io/otel/trace v1.30.0
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
google.golang.org/grpc v1.67.1
Expand All @@ -36,7 +41,6 @@ require (
k8s.io/api v0.31.1
k8s.io/apimachinery v0.31.1
k8s.io/client-go v0.31.1
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
knative.dev/pkg v0.0.0-20240917091217-aaab500c26c4
sigs.k8s.io/controller-runtime v0.19.0
sigs.k8s.io/yaml v1.4.0
Expand All @@ -48,7 +52,6 @@ require (
github.com/Masterminds/semver/v3 v3.3.0 // indirect
github.com/Microsoft/go-winio v0.4.14 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
Expand Down Expand Up @@ -112,7 +115,6 @@ require (
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0 // indirect
go.opentelemetry.io/otel/metric v1.30.0 // indirect
go.opentelemetry.io/otel/sdk v1.30.0 // indirect
go.opentelemetry.io/otel/trace v1.30.0 // indirect
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
golang.org/x/crypto v0.27.0 // indirect
golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa // indirect
Expand All @@ -134,6 +136,7 @@ require (
k8s.io/apiextensions-apiserver v0.31.0 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20240808142205-8e686545bdb8 // indirect
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
)
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -219,16 +219,22 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIX
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg=
go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts=
go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc=
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.6.0 h1:QSKmLBzbFULSyHzOdO9JsN9lpE4zkrz1byYGmJecdVE=
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.6.0/go.mod h1:sTQ/NH8Yrirf0sJ5rWqVu+oT82i4zL9FaF6rWcqnptM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0 h1:lsInsfvhVIfOI6qHVyysXMNDnjO9Npvl7tlDPJFBVd4=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0/go.mod h1:KQsVNh4OjgjTG0G6EiNi1jVpnaeeKsKMRwbLN+f1+8M=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0 h1:umZgi92IyxfXd/l4kaDhnKgY8rnN/cZcF1LKc6I8OQ8=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0/go.mod h1:4lVs6obhSVRb1EW5FhOuBTyiQhtRtAnnva9vD3yRfq8=
go.opentelemetry.io/otel/log v0.6.0 h1:nH66tr+dmEgW5y+F9LanGJUBYPrRgP4g2EkmPE3LeK8=
go.opentelemetry.io/otel/log v0.6.0/go.mod h1:KdySypjQHhP069JX0z/t26VHwa8vSwzgaKmXtIB3fJM=
go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w=
go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ=
go.opentelemetry.io/otel/sdk v1.30.0 h1:cHdik6irO49R5IysVhdn8oaiR9m8XluDaJAs4DfOrYE=
go.opentelemetry.io/otel/sdk v1.30.0/go.mod h1:p14X4Ok8S+sygzblytT1nqG98QG2KYKv++HE0LY/mhg=
go.opentelemetry.io/otel/sdk/log v0.6.0 h1:4J8BwXY4EeDE9Mowg+CyhWVBhTSLXVXodiXxS/+PGqI=
go.opentelemetry.io/otel/sdk/log v0.6.0/go.mod h1:L1DN8RMAduKkrwRAFDEX3E3TLOq46+XMGSbUfHU/+vE=
go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc=
go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o=
go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0=
Expand Down
5 changes: 3 additions & 2 deletions internal/deprecated/controller/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package controller
import (
"fmt"

"github.com/go-logr/logr"
rufio "github.com/tinkerbell/rufio/api/v1alpha1"
"github.com/tinkerbell/tink/api/v1alpha1"
"github.com/tinkerbell/tink/internal/deprecated/workflow"
Expand All @@ -28,7 +29,7 @@ func DefaultScheme() *runtime.Scheme {

// NewManager creates a new controller manager with tink controller controllers pre-registered.
// If opts.Scheme is nil, DefaultScheme() is used.
func NewManager(cfg *rest.Config, opts ctrl.Options) (ctrl.Manager, error) {
func NewManager(cfg *rest.Config, opts ctrl.Options, logger logr.Logger) (ctrl.Manager, error) {
if opts.Scheme == nil {
opts.Scheme = DefaultScheme()
}
Expand All @@ -46,7 +47,7 @@ func NewManager(cfg *rest.Config, opts ctrl.Options) (ctrl.Manager, error) {
return nil, fmt.Errorf("set up ready check: %w", err)
}

err = workflow.NewReconciler(mgr.GetClient()).SetupWithManager(mgr)
err = workflow.NewReconciler(mgr.GetClient(), logger).SetupWithManager(mgr)
if err != nil {
return nil, fmt.Errorf("setup workflow reconciler: %w", err)
}
Expand Down
37 changes: 35 additions & 2 deletions internal/deprecated/workflow/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ package workflow
import (
"context"
"fmt"
"log/slog"
"time"

rufio "github.com/tinkerbell/rufio/api/v1alpha1"
"github.com/tinkerbell/tink/api/v1alpha1"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/trace"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
Expand All @@ -26,9 +29,14 @@ func (j jobName) String() string {

// this function will update the Workflow status.
func (s *state) handleJob(ctx context.Context, actions []rufio.Action, name jobName) (reconcile.Result, error) {
tracer := otel.Tracer("handleJob")
var span trace.Span
ctx, span = tracer.Start(ctx, "handleJob")
defer span.End()
// there are 3 phases. 1. Clean up existing 2. Create new 3. Track status
// 1. clean up existing job if it wasn't already deleted
if j := s.workflow.Status.BootOptions.Jobs[name.String()]; !j.ExistingJobDeleted {
s.logger.InfoContext(ctx, "deleting existing job", "name", name)
result, err := s.deleteExisting(ctx, name)
if err != nil {
return result, err
Expand All @@ -39,6 +47,7 @@ func (s *state) handleJob(ctx context.Context, actions []rufio.Action, name jobN

// 2. create a new job
if uid := s.workflow.Status.BootOptions.Jobs[name.String()].UID; uid == "" {
s.logger.InfoContext(ctx, "no uid found for job", "name", name)
result, err := s.createJob(ctx, actions, name)
if err != nil {
s.workflow.Status.SetCondition(v1alpha1.WorkflowCondition{
Expand All @@ -62,6 +71,7 @@ func (s *state) handleJob(ctx context.Context, actions []rufio.Action, name jobN

// 3. track status
if !s.workflow.Status.BootOptions.Jobs[name.String()].Complete {
s.logger.InfoContext(ctx, "tracking job", "name", name)
// track status
r, tState, err := s.trackRunningJob(ctx, name)
if err != nil {
Expand Down Expand Up @@ -111,13 +121,23 @@ func (s *state) deleteExisting(ctx context.Context, name jobName) (reconcile.Res

// This function will update the Workflow status.
func (s *state) createJob(ctx context.Context, actions []rufio.Action, name jobName) (reconcile.Result, error) {
tracer := otel.Tracer("createJob")
var span trace.Span
ctx, span = tracer.Start(ctx, "createJob")
defer span.End()
// create a new job
// The assumption is that the UID is not set. UID checking is not handled here.
// 1. look up if there's an existing job with the same name, if so update the status with the UID and return
// 2. if there's no existing job, create a new job, update the status with the UID, and return

rj := &rufio.Job{}
if err := s.client.Get(ctx, client.ObjectKey{Name: name.String(), Namespace: s.workflow.Namespace}, rj); err == nil {
s.logger.InfoContext(ctx, "job already exists", "name", name)
if !rj.DeletionTimestamp.IsZero() {
s.logger.InfoContext(ctx, "job is being deleted", "name", name)
return reconcile.Result{Requeue: true}, nil
}
//TODO(jacobweinstock): job exists means that the job name and uid from the status are the same.
// get the UID and update the status
jStatus := s.workflow.Status.BootOptions.Jobs[name.String()]
jStatus.UID = rj.GetUID()
Expand All @@ -134,9 +154,10 @@ func (s *state) createJob(ctx context.Context, actions []rufio.Action, name jobN
return reconcile.Result{}, fmt.Errorf("hardware %q does not have a BMC", s.hardware.Name)
}

if err := create(ctx, s.client, name.String(), s.hardware, s.workflow.Namespace, actions); err != nil {
if err := create(ctx, s.logger, s.client, name.String(), s.hardware, s.workflow.Namespace, actions); err != nil {
return reconcile.Result{}, fmt.Errorf("error creating job: %w", err)
}
s.logger.InfoContext(ctx, "job created", "name", name)

return reconcile.Result{Requeue: true}, nil
}
Expand All @@ -152,17 +173,23 @@ var (

// This function will update the Workflow status.
func (s *state) trackRunningJob(ctx context.Context, name jobName) (reconcile.Result, trackedState, error) {
tracer := otel.Tracer("trackRunningJob")
var span trace.Span
ctx, span = tracer.Start(ctx, "trackRunningJob")
defer span.End()
// track status
// get the job
rj := &rufio.Job{}
if err := s.client.Get(ctx, client.ObjectKey{Name: name.String(), Namespace: s.workflow.Namespace}, rj); err != nil {
return reconcile.Result{}, trackedStateError, fmt.Errorf("error getting job: %w", err)
}
if rj.HasCondition(rufio.JobFailed, rufio.ConditionTrue) {
s.logger.InfoContext(ctx, "job failed", "name", name)
// job failed
return reconcile.Result{}, trackedStateFailed, fmt.Errorf("job failed")
}
if rj.HasCondition(rufio.JobCompleted, rufio.ConditionTrue) {
s.logger.InfoContext(ctx, "job completed", "name", name)
// job completed
jStatus := s.workflow.Status.BootOptions.Jobs[name.String()]
jStatus.Complete = true
Expand All @@ -171,11 +198,17 @@ func (s *state) trackRunningJob(ctx context.Context, name jobName) (reconcile.Re
return reconcile.Result{}, trackedStateComplete, nil
}
// still running
s.logger.InfoContext(ctx, "job still running", "name", name)
time.Sleep(s.backoff.NextBackOff())
return reconcile.Result{Requeue: true}, trackedStateRunning, nil
}

func create(ctx context.Context, cc client.Client, name string, hw *v1alpha1.Hardware, ns string, tasks []rufio.Action) error {
func create(ctx context.Context, logger *slog.Logger, cc client.Client, name string, hw *v1alpha1.Hardware, ns string, tasks []rufio.Action) error {
tracer := otel.Tracer("create")
var span trace.Span
ctx, span = tracer.Start(ctx, "create")
defer span.End()
logger.InfoContext(ctx, "creating job", "name", name)
if err := cc.Create(ctx, &rufio.Job{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Expand Down
9 changes: 9 additions & 0 deletions internal/deprecated/workflow/pre.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,23 @@ import (
"github.com/pkg/errors"
rufio "github.com/tinkerbell/rufio/api/v1alpha1"
"github.com/tinkerbell/tink/api/v1alpha1"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/trace"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

// prepareWorkflow prepares the workflow for execution.
// The workflow (s.workflow) can be updated even if an error occurs.
// Any patching of the workflow object in a cluster is left up to the caller.
func (s *state) prepareWorkflow(ctx context.Context) (reconcile.Result, error) {
tracer := otel.Tracer("prepareWorkflow")
var span trace.Span
ctx, span = tracer.Start(ctx, "prepareWorkflow")
defer span.End()
// handle bootoptions
// 1. Handle toggling allowPXE in a hardware object if toggleAllowNetboot is true.
if s.workflow.Spec.BootOptions.ToggleAllowNetboot {
s.logger.InfoContext(ctx, "toggling allowPXE true")
if err := s.toggleHardware(ctx, true); err != nil {
return reconcile.Result{}, err
}
Expand All @@ -25,6 +32,7 @@ func (s *state) prepareWorkflow(ctx context.Context) (reconcile.Result, error) {
// 2. Handle booting scenarios.
switch s.workflow.Spec.BootOptions.BootMode {
case v1alpha1.BootModeNetboot:
s.logger.InfoContext(ctx, "boot mode netboot")
if s.hardware == nil {
return reconcile.Result{}, errors.New("hardware is nil")
}
Expand Down Expand Up @@ -60,6 +68,7 @@ func (s *state) prepareWorkflow(ctx context.Context) (reconcile.Result, error) {
}
return r, err
case v1alpha1.BootModeISO:
s.logger.InfoContext(ctx, "boot mode iso")
if s.hardware == nil {
return reconcile.Result{}, errors.New("hardware is nil")
}
Expand Down
Loading

0 comments on commit 30ea6ab

Please sign in to comment.