Skip to content
This repository has been archived by the owner on Jan 8, 2024. It is now read-only.

Initial metrics spike for gathering metrics about Waypoint operations #3440

Merged
merged 6 commits into from
Jun 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/3440.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
server: Introduce basic server-side metric collections around operations
```
5 changes: 4 additions & 1 deletion internal/cli/server_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,8 @@ func (c *ServerRunCommand) Run(args []string) int {
}
if httpInsecureLn != nil {
values = append(values, terminal.NamedValue{
Name: "HTTP Address (Insecure)", Value: httpInsecureLn.Addr().String()})
Name: "HTTP Address (Insecure)", Value: httpInsecureLn.Addr().String(),
})
}
if auth {
values = append(values, terminal.NamedValue{Name: "Auth Required", Value: "yes"})
Expand Down Expand Up @@ -374,7 +375,9 @@ This command will bootstrap the server and setup a CLI context.
telemetryOptions = append(telemetryOptions, telemetry.WithDatadogExporter(
datadog.Options{
TraceAddr: c.flagTelemetryDatadogTraceAddr,
StatsAddr: c.flagTelemetryDatadogTraceAddr,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I simply re-use the current trace address, but we could easily add a separate flag for stats.

Service: "waypoint",
Namespace: "waypoint",
},
))
}
Expand Down
109 changes: 109 additions & 0 deletions internal/telemetry/metrics/stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package metrics

import (
"context"
"fmt"
"os"
"time"

"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)

func init() {
// Register OpenCensus views.
if err := view.Register(statsViews...); err != nil {
fmt.Fprintf(os.Stderr, "error registering OpenCensus views: %v", err)
}
}

var (
// TagOperation is a tag for capturing the operation type, e.g. build,
// deploy, release
TagOperation = tag.MustNewKey("operation")

operationDurationMeasure = stats.Int64(
"operation_duration.milliseconds",
"The duration for this operation, measured in milliseconds",
stats.UnitMilliseconds,
)

operationCountMeasure = stats.Int64(
"operation_count",
"count of operations",
stats.UnitDimensionless,
)

// views aggregate measurements
waypointOperationCounts = &view.View{
Name: operationCountMeasure.Name(),
Description: operationCountMeasure.Description(),
TagKeys: []tag.Key{TagOperation},
Measure: operationCountMeasure,
Aggregation: view.Count(),
}

waypointOperationDurations = &view.View{
Name: operationDurationMeasure.Name(),
Description: operationDurationMeasure.Description(),
TagKeys: []tag.Key{TagOperation},
Measure: operationDurationMeasure,
// add a custom distribution bucket of 10 second intervals between
// 0 and 240 seconds, then 10 minute intervals up to 60 minutes
Aggregation: view.Distribution(
(10 * time.Second).Seconds(),
(20 * time.Second).Seconds(),
(30 * time.Second).Seconds(),
(40 * time.Second).Seconds(),
(50 * time.Second).Seconds(),
(60 * time.Second).Seconds(),
(70 * time.Second).Seconds(),
(80 * time.Second).Seconds(),
(90 * time.Second).Seconds(),
(100 * time.Second).Seconds(),
(110 * time.Second).Seconds(),
(120 * time.Second).Seconds(),
(130 * time.Second).Seconds(),
(140 * time.Second).Seconds(),
(150 * time.Second).Seconds(),
(160 * time.Second).Seconds(),
(170 * time.Second).Seconds(),
(180 * time.Second).Seconds(),
(190 * time.Second).Seconds(),
(200 * time.Second).Seconds(),
(210 * time.Second).Seconds(),
(220 * time.Second).Seconds(),
(230 * time.Second).Seconds(),
(240 * time.Second).Seconds(),
(10 * time.Minute).Seconds(),
(20 * time.Minute).Seconds(),
(30 * time.Minute).Seconds(),
(40 * time.Minute).Seconds(),
(50 * time.Minute).Seconds(),
(60 * time.Minute).Seconds(),
),
}

// statsViews is a list of all stats views for
// measurements emitted by this package.
statsViews = []*view.View{
waypointOperationDurations,
waypointOperationCounts,
}
)

// MeasureOperation records the duration of an operation and upserts the
// operation value into the TagOperation tag
func MeasureOperation(ctx context.Context, lastWriteAt time.Time, operationName string) {
_ = stats.RecordWithTags(ctx, []tag.Mutator{
tag.Upsert(TagOperation, operationName),
}, operationDurationMeasure.M(time.Since(lastWriteAt).Milliseconds()))
}

// CountOperation records a single incremental value for an operation
func CountOperation(ctx context.Context, operationName string) {
_ = stats.RecordWithTags(ctx, []tag.Mutator{
tag.Upsert(TagOperation, operationName),
}, operationCountMeasure.M(1))
}
55 changes: 55 additions & 0 deletions pkg/server/singleprocess/service_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"io"
"strings"
"time"

"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb"
Expand All @@ -14,6 +15,7 @@ import (
"google.golang.org/protobuf/types/known/emptypb"
empty "google.golang.org/protobuf/types/known/emptypb"

"github.com/hashicorp/waypoint/internal/telemetry/metrics"
pb "github.com/hashicorp/waypoint/pkg/server/gen"
"github.com/hashicorp/waypoint/pkg/server/logstream"
serverptypes "github.com/hashicorp/waypoint/pkg/server/ptypes"
Expand Down Expand Up @@ -536,6 +538,12 @@ func (s *Service) RunnerJobStream(
log.Trace("loaded config sources for job", "total_sourcers", len(cfgSrcs))

log.Debug("sending job assignment to runner")
catsby marked this conversation as resolved.
Show resolved Hide resolved

operation := operationString(job.Job)
defer func(start time.Time) {
metrics.MeasureOperation(ctx, start, operation)
}(time.Now())
metrics.CountOperation(ctx, operation)
// Send the job assignment.
//
// If this has an error, we continue to accumulate the error until
Expand Down Expand Up @@ -874,3 +882,50 @@ func (s *Service) runnerVerifyToken(

return nil
}

func operationString(job *pb.Job) string {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we seem to do this in more than one place in Waypoint, so I'm open to suggestions to where we could place this and use it. Maybe /pkg/server/ptypes/job.go?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CLI client needs something like this too, so maybe pkg/server/... isn't the right place. Maybe it could go some where in internal/...? 🤔

// Types that are assignable to Operation:
switch job.Operation.(type) {
case *pb.Job_Noop_:
return "noop"
case *pb.Job_Build:
return "build"
case *pb.Job_Push:
return "push"
case *pb.Job_Deploy:
return "deploy"
case *pb.Job_Destroy:
return "destroy"
case *pb.Job_Release:
return "release"
case *pb.Job_Validate:
return "validate"
case *pb.Job_Auth:
return "auth"
case *pb.Job_Docs:
return "docs"
case *pb.Job_ConfigSync:
return "config_sync"
case *pb.Job_Exec:
return "exec"
case *pb.Job_Up:
return "up"
case *pb.Job_Logs:
return "logs"
case *pb.Job_QueueProject:
return "queue_project"
case *pb.Job_Poll:
return "poll"
case *pb.Job_StatusReport:
return "status_report"
case *pb.Job_StartTask:
return "start_task"
case *pb.Job_StopTask:
return "stop_task"
case *pb.Job_WatchTask:
return "watch_task"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about making these PascalCase so they match the proto go definition? Like WatchTask, ConfigSync, Poll, etc.

Copy link
Contributor Author

@catsby catsby Jun 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I've seen internally (in other HashiCorp projects that use OpenCensus), lowercase with _ is the common style, so I chose this to be consistent

EDIT: clarify "where" internally I have seen this style

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ooh ok! Makes sense from that standpoint 👍🏻 Sounds good.

case *pb.Job_Init:
return "init"
}
return "unknown"
}