Skip to content
This repository has been archived by the owner on Jan 8, 2024. It is now read-only.

Commit

Permalink
Merge pull request #3440 from hashicorp/simple-metrics
Browse files Browse the repository at this point in the history
Initial metrics spike for gathering metrics about Waypoint operations
  • Loading branch information
catsby authored Jun 21, 2022
2 parents b310390 + 3b7452b commit 7b83979
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .changelog/3440.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
server: Introduce basic server-side metric collections around operations
```
5 changes: 4 additions & 1 deletion internal/cli/server_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,8 @@ func (c *ServerRunCommand) Run(args []string) int {
}
if httpInsecureLn != nil {
values = append(values, terminal.NamedValue{
Name: "HTTP Address (Insecure)", Value: httpInsecureLn.Addr().String()})
Name: "HTTP Address (Insecure)", Value: httpInsecureLn.Addr().String(),
})
}
if auth {
values = append(values, terminal.NamedValue{Name: "Auth Required", Value: "yes"})
Expand Down Expand Up @@ -374,7 +375,9 @@ This command will bootstrap the server and setup a CLI context.
telemetryOptions = append(telemetryOptions, telemetry.WithDatadogExporter(
datadog.Options{
TraceAddr: c.flagTelemetryDatadogTraceAddr,
StatsAddr: c.flagTelemetryDatadogTraceAddr,
Service: "waypoint",
Namespace: "waypoint",
},
))
}
Expand Down
109 changes: 109 additions & 0 deletions internal/telemetry/metrics/stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package metrics

import (
"context"
"fmt"
"os"
"time"

"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)

func init() {
// Register OpenCensus views.
if err := view.Register(statsViews...); err != nil {
fmt.Fprintf(os.Stderr, "error registering OpenCensus views: %v", err)
}
}

var (
// TagOperation is a tag for capturing the operation type, e.g. build,
// deploy, release
TagOperation = tag.MustNewKey("operation")

operationDurationMeasure = stats.Int64(
"operation_duration.milliseconds",
"The duration for this operation, measured in milliseconds",
stats.UnitMilliseconds,
)

operationCountMeasure = stats.Int64(
"operation_count",
"count of operations",
stats.UnitDimensionless,
)

// views aggregate measurements
waypointOperationCounts = &view.View{
Name: operationCountMeasure.Name(),
Description: operationCountMeasure.Description(),
TagKeys: []tag.Key{TagOperation},
Measure: operationCountMeasure,
Aggregation: view.Count(),
}

waypointOperationDurations = &view.View{
Name: operationDurationMeasure.Name(),
Description: operationDurationMeasure.Description(),
TagKeys: []tag.Key{TagOperation},
Measure: operationDurationMeasure,
// add a custom distribution bucket of 10 second intervals between
// 0 and 240 seconds, then 10 minute intervals up to 60 minutes
Aggregation: view.Distribution(
(10 * time.Second).Seconds(),
(20 * time.Second).Seconds(),
(30 * time.Second).Seconds(),
(40 * time.Second).Seconds(),
(50 * time.Second).Seconds(),
(60 * time.Second).Seconds(),
(70 * time.Second).Seconds(),
(80 * time.Second).Seconds(),
(90 * time.Second).Seconds(),
(100 * time.Second).Seconds(),
(110 * time.Second).Seconds(),
(120 * time.Second).Seconds(),
(130 * time.Second).Seconds(),
(140 * time.Second).Seconds(),
(150 * time.Second).Seconds(),
(160 * time.Second).Seconds(),
(170 * time.Second).Seconds(),
(180 * time.Second).Seconds(),
(190 * time.Second).Seconds(),
(200 * time.Second).Seconds(),
(210 * time.Second).Seconds(),
(220 * time.Second).Seconds(),
(230 * time.Second).Seconds(),
(240 * time.Second).Seconds(),
(10 * time.Minute).Seconds(),
(20 * time.Minute).Seconds(),
(30 * time.Minute).Seconds(),
(40 * time.Minute).Seconds(),
(50 * time.Minute).Seconds(),
(60 * time.Minute).Seconds(),
),
}

// statsViews is a list of all stats views for
// measurements emitted by this package.
statsViews = []*view.View{
waypointOperationDurations,
waypointOperationCounts,
}
)

// MeasureOperation records the duration of an operation and upserts the
// operation value into the TagOperation tag
func MeasureOperation(ctx context.Context, lastWriteAt time.Time, operationName string) {
_ = stats.RecordWithTags(ctx, []tag.Mutator{
tag.Upsert(TagOperation, operationName),
}, operationDurationMeasure.M(time.Since(lastWriteAt).Milliseconds()))
}

// CountOperation records a single incremental value for an operation
func CountOperation(ctx context.Context, operationName string) {
_ = stats.RecordWithTags(ctx, []tag.Mutator{
tag.Upsert(TagOperation, operationName),
}, operationCountMeasure.M(1))
}
55 changes: 55 additions & 0 deletions pkg/server/singleprocess/service_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"io"
"strings"
"time"

"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb"
Expand All @@ -14,6 +15,7 @@ import (
"google.golang.org/protobuf/types/known/emptypb"
empty "google.golang.org/protobuf/types/known/emptypb"

"github.com/hashicorp/waypoint/internal/telemetry/metrics"
pb "github.com/hashicorp/waypoint/pkg/server/gen"
"github.com/hashicorp/waypoint/pkg/server/logstream"
serverptypes "github.com/hashicorp/waypoint/pkg/server/ptypes"
Expand Down Expand Up @@ -536,6 +538,12 @@ func (s *Service) RunnerJobStream(
log.Trace("loaded config sources for job", "total_sourcers", len(cfgSrcs))

log.Debug("sending job assignment to runner")

operation := operationString(job.Job)
defer func(start time.Time) {
metrics.MeasureOperation(ctx, start, operation)
}(time.Now())
metrics.CountOperation(ctx, operation)
// Send the job assignment.
//
// If this has an error, we continue to accumulate the error until
Expand Down Expand Up @@ -874,3 +882,50 @@ func (s *Service) runnerVerifyToken(

return nil
}

func operationString(job *pb.Job) string {
// Types that are assignable to Operation:
switch job.Operation.(type) {
case *pb.Job_Noop_:
return "noop"
case *pb.Job_Build:
return "build"
case *pb.Job_Push:
return "push"
case *pb.Job_Deploy:
return "deploy"
case *pb.Job_Destroy:
return "destroy"
case *pb.Job_Release:
return "release"
case *pb.Job_Validate:
return "validate"
case *pb.Job_Auth:
return "auth"
case *pb.Job_Docs:
return "docs"
case *pb.Job_ConfigSync:
return "config_sync"
case *pb.Job_Exec:
return "exec"
case *pb.Job_Up:
return "up"
case *pb.Job_Logs:
return "logs"
case *pb.Job_QueueProject:
return "queue_project"
case *pb.Job_Poll:
return "poll"
case *pb.Job_StatusReport:
return "status_report"
case *pb.Job_StartTask:
return "start_task"
case *pb.Job_StopTask:
return "stop_task"
case *pb.Job_WatchTask:
return "watch_task"
case *pb.Job_Init:
return "init"
}
return "unknown"
}

0 comments on commit 7b83979

Please sign in to comment.