Skip to content

Add grpc client-side reconnection #4598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions chart/templates/content-service-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ spec:
runAsUser: 1000
{{ include "gitpod.container.defaultEnv" $this | indent 8 }}
{{ include "gitpod.container.tracingEnv" $this | indent 8 }}
- name: GRPC_GO_RETRY
value: "on"
volumeMounts:
- name: config
mountPath: "/config"
Expand Down
2 changes: 2 additions & 0 deletions chart/templates/image-builder-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,7 @@ spec:
{{ include "gitpod.container.tracingEnv" $this | indent 8 }}
- name: DOCKER_HOST
value: "tcp://localhost:2375"
- name: GRPC_GO_RETRY
value: "on"
{{ toYaml .Values.defaults | indent 6 }}
{{ end }}
2 changes: 1 addition & 1 deletion chart/templates/registry-facade-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ data:
},
{{ end -}}
"remoteSpecProvider": {
"addr": "ws-manager:{{ .Values.components.wsManager.ports.rpc.containerPort }}",
"addr": "dns:///ws-manager:{{ .Values.components.wsManager.ports.rpc.containerPort }}",
"tls": {
"ca": "/ws-manager-client-tls-certs/ca.crt",
"crt": "/ws-manager-client-tls-certs/tls.crt",
Expand Down
2 changes: 2 additions & 0 deletions chart/templates/registry-facade-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ spec:
runAsUser: 1000
{{ include "gitpod.container.defaultEnv" $this | indent 8 }}
{{ include "gitpod.container.tracingEnv" $this | indent 8 }}
- name: GRPC_GO_RETRY
value: "on"
volumeMounts:
- name: cache
mountPath: "/mnt/cache"
Expand Down
2 changes: 1 addition & 1 deletion chart/templates/server-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ manager: []
manager:
{{- if not $comp.wsmanSkipSelf }}
- name: "{{ template "gitpod.installation.shortname" . }}"
url: "ws-manager:8080"
url: "dns:///ws-manager:8080"
state: "available"
maxScore: 100
score: 50
Expand Down
2 changes: 1 addition & 1 deletion chart/templates/ws-proxy-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ data:
"header": "{{- $comp.hostHeader -}}"
},
"workspaceInfoProviderConfig": {
"wsManagerAddr": "ws-manager:8080",
"wsManagerAddr": "dns:///ws-manager:8080",
"reconnectInterval": "3s",
"tls": {
"ca": "/ws-manager-client-tls-certs/ca.crt",
Expand Down
2 changes: 1 addition & 1 deletion chart/templates/ws-scheduler-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ data:
"enabled": true,
"driver": {
"wsman": {
"addr": "ws-manager:8080",
"addr": "dns:///ws-manager:8080",
"tls": {
"ca": "/ws-manager-client-tls-certs/ca.crt",
"crt": "/ws-manager-client-tls-certs/tls.crt",
Expand Down
5 changes: 5 additions & 0 deletions components/content-service/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ var runCmd = &cobra.Command{
reg.MustRegister(grpcMetrics)

grpcOpts := []grpc.ServerOption{
// terminate the connection if the client pings more than once every 2 seconds
grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
MinTime: 2 * time.Second,
PermitWithoutStream: true,
}),
// We don't know how good our cients are at closing connections. If they don't close them properly
// we'll be leaking goroutines left and right. Closing Idle connections should prevent that.
grpc.KeepaliveParams(keepalive.ServerParameters{MaxConnectionIdle: 30 * time.Minute}),
Expand Down
12 changes: 11 additions & 1 deletion components/ee/ws-scheduler/pkg/scaler/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/keepalive"
"google.golang.org/grpc/status"

"github.com/gitpod-io/gitpod/common-go/log"
Expand Down Expand Up @@ -78,7 +79,16 @@ func NewWorkspaceManagerPrescaleDriver(config WorkspaceManagerPrescaleDriverConf
}
config.WorkspaceImage = imgRef.String()

var grpcOpts []grpc.DialOption
grpcOpts := []grpc.DialOption{
grpc.WithBlock(),
grpc.WithBackoffMaxDelay(5 * time.Second),
grpc.WithKeepaliveParams(keepalive.ClientParameters{
Time: 5 * time.Second,
Timeout: time.Second,
PermitWithoutStream: true,
}),
}

if config.WsManager.TLS != nil {
ca := config.WsManager.TLS.CA
crt := config.WsManager.TLS.Certificate
Expand Down
5 changes: 5 additions & 0 deletions components/image-builder/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ var runCmd = &cobra.Command{
}

grpcOpts := []grpc.ServerOption{
// terminate the connection if the client pings more than once every 2 seconds
grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
MinTime: 2 * time.Second,
PermitWithoutStream: true,
}),
// We don't know how good our cients are at closing connections. If they don't close them properly
// we'll be leaking goroutines left and right. Closing Idle connections should prevent that.
grpc.KeepaliveParams(keepalive.ServerParameters{MaxConnectionIdle: 30 * time.Minute}),
Expand Down
20 changes: 19 additions & 1 deletion components/registry-facade/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"crypto/sha256"
"encoding/hex"
"io"
"net"
"net/http"
"os"
"os/signal"
Expand Down Expand Up @@ -64,7 +65,7 @@ var runCmd = &cobra.Command{

promreg := prometheus.NewRegistry()
gpreg := prometheus.WrapRegistererWithPrefix("gitpod_registry_facade_", promreg)
rtt, err := registry.NewMeasuringRegistryRoundTripper(http.DefaultTransport, prometheus.WrapRegistererWithPrefix("downstream_", gpreg))
rtt, err := registry.NewMeasuringRegistryRoundTripper(newDefaultTransport(), prometheus.WrapRegistererWithPrefix("downstream_", gpreg))
if err != nil {
log.WithError(err).Fatal("cannot registry metrics")
}
Expand Down Expand Up @@ -125,6 +126,23 @@ var runCmd = &cobra.Command{
},
}

func newDefaultTransport() *http.Transport {
return &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
DualStack: false,
}).DialContext,
MaxIdleConns: 0,
MaxIdleConnsPerHost: 32,
IdleConnTimeout: 30 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 5 * time.Second,
DisableKeepAlives: true,
}
}

func init() {
rootCmd.AddCommand(runCmd)
}
Expand Down
7 changes: 4 additions & 3 deletions components/registry-facade/pkg/registry/blob.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@ import (
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/remotes"
distv2 "github.com/docker/distribution/registry/api/v2"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/common-go/tracing"
"github.com/gitpod-io/gitpod/registry-facade/api"
"github.com/gorilla/handlers"
"github.com/opencontainers/go-digest"
ociv1 "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/opentracing/opentracing-go"

"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/common-go/tracing"
"github.com/gitpod-io/gitpod/registry-facade/api"
)

func (reg *Registry) handleBlob(ctx context.Context, r *http.Request) http.Handler {
Expand Down
3 changes: 2 additions & 1 deletion components/registry-facade/pkg/registry/imagecfg.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ import (
"fmt"
"sync"

"github.com/gitpod-io/gitpod/registry-facade/api"
lru "github.com/hashicorp/golang-lru"
ociv1 "github.com/opencontainers/image-spec/specs-go/v1"
"golang.org/x/xerrors"
"google.golang.org/grpc"
"google.golang.org/grpc/connectivity"

"github.com/gitpod-io/gitpod/registry-facade/api"
)

// ErrRefInvalid is returned by spec provider who cannot interpret the ref
Expand Down
5 changes: 3 additions & 2 deletions components/registry-facade/pkg/registry/layersource.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ import (

"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/remotes"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/registry-facade/api"
lru "github.com/hashicorp/golang-lru"
"github.com/opencontainers/go-digest"
ociv1 "github.com/opencontainers/image-spec/specs-go/v1"
"golang.org/x/xerrors"

"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/registry-facade/api"
)

const (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ import (
"context"
"testing"

"github.com/gitpod-io/gitpod/registry-facade/pkg/registry"
"github.com/gitpod-io/gitpod/registry-facade/pkg/registry/mock"
"github.com/golang/mock/gomock"
"github.com/opencontainers/go-digest"

"github.com/gitpod-io/gitpod/registry-facade/pkg/registry"
"github.com/gitpod-io/gitpod/registry-facade/pkg/registry/mock"
)

func TestRevisioingLayerSource(t *testing.T) {
Expand Down
7 changes: 4 additions & 3 deletions components/registry-facade/pkg/registry/manifest.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@ import (
"github.com/containerd/containerd/images"
"github.com/containerd/containerd/remotes"
distv2 "github.com/docker/distribution/registry/api/v2"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/common-go/tracing"
"github.com/gitpod-io/gitpod/registry-facade/api"
"github.com/gorilla/handlers"
"github.com/opencontainers/go-digest"
ociv1 "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/opentracing/opentracing-go"
"github.com/pkg/errors"

"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/common-go/tracing"
"github.com/gitpod-io/gitpod/registry-facade/api"
)

func (reg *Registry) handleManifest(ctx context.Context, r *http.Request) http.Handler {
Expand Down
14 changes: 11 additions & 3 deletions components/registry-facade/pkg/registry/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ import (
"fmt"
"net"
"net/http"
"net/http/httputil"
"os"
"path/filepath"
"strings"
"time"

"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/registry-facade/api"
Expand All @@ -33,6 +33,7 @@ import (
"golang.org/x/xerrors"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/keepalive"
)

// Config configures the registry
Expand Down Expand Up @@ -155,6 +156,13 @@ func NewRegistry(cfg Config, newResolver ResolverProvider, reg prometheus.Regist
opts := []grpc.DialOption{
grpc.WithUnaryInterceptor(grpc_opentracing.UnaryClientInterceptor(grpc_opentracing.WithTracer(opentracing.GlobalTracer()))),
grpc.WithStreamInterceptor(grpc_opentracing.StreamClientInterceptor(grpc_opentracing.WithTracer(opentracing.GlobalTracer()))),
grpc.WithBlock(),
grpc.WithBackoffMaxDelay(5 * time.Second),
grpc.WithKeepaliveParams(keepalive.ClientParameters{
Time: 5 * time.Second,
Timeout: time.Second,
PermitWithoutStream: true,
}),
}

if cfg.RemoteSpecProvider.TLS != nil {
Expand Down Expand Up @@ -343,8 +351,8 @@ type dispatchFunc func(ctx context.Context, r *http.Request) http.Handler
// dispatcher wraps a dispatchFunc and provides context
func dispatcher(d dispatchFunc) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fc, _ := httputil.DumpRequest(r, false)
log.WithField("req", string(fc)).Debug("dispatching request")
//fc, _ := httputil.DumpRequest(r, false)
//log.WithField("req", string(fc)).Debug("dispatching request")

// Get context from request, add vars and other info and sync back
ctx := r.Context()
Expand Down
5 changes: 5 additions & 0 deletions components/ws-daemon/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ var runCmd = &cobra.Command{
reg.MustRegister(grpcMetrics)

grpcOpts := []grpc.ServerOption{
// terminate the connection if the client pings more than once every 2 seconds
grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
MinTime: 2 * time.Second,
PermitWithoutStream: true,
}),
// We don't know how good our cients are at closing connections. If they don't close them properly
// we'll be leaking goroutines left and right. Closing Idle connections should prevent that.
grpc.KeepaliveParams(keepalive.ServerParameters{MaxConnectionIdle: 30 * time.Minute}),
Expand Down
5 changes: 5 additions & 0 deletions components/ws-manager/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ var runCmd = &cobra.Command{
metrics.Registry.MustRegister(grpcMetrics)

grpcOpts := []grpc.ServerOption{
// terminate the connection if the client pings more than once every 2 seconds
grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
MinTime: 2 * time.Second,
PermitWithoutStream: true,
}),
// We don't know how good our cients are at closing connections. If they don't close them properly
// we'll be leaking goroutines left and right. Closing Idle connections should prevent that.
grpc.KeepaliveParams(keepalive.ServerParameters{MaxConnectionIdle: 30 * time.Minute}),
Expand Down
29 changes: 6 additions & 23 deletions components/ws-manager/pkg/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/keepalive"
"google.golang.org/grpc/peer"
"google.golang.org/grpc/status"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -1089,35 +1090,17 @@ func (m *Manager) connectToWorkspaceDaemon(ctx context.Context, wso workspaceObj

// newWssyncConnectionFactory creates a new wsdaemon connection factory based on the wsmanager configuration
func newWssyncConnectionFactory(managerConfig Configuration) (grpcpool.Factory, error) {
// We use client-side retry when ws-daemon is unavilable. The unavailability need just cover the time ws-daemon
// needs to restart. If ws-daemon is unavailable during a connection attempt, the WithBlock and WithBackoffMaxDelay
// configure the behaviour.
// Once the connection has been established, but becomes unavailable during a call, this retry mechanism takes hold.
//
// Note: the retry policy only has an effect if ws-manager is run with the GRPC_GO_RETRY=on env var set.
// see https://github.com/grpc/grpc-go/blob/506b7730668b5a13465224b0d8133f974a3f843d/dialoptions.go#L522-L524
var retryPolicy = `{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing this is fine because

  • we never had GRPC_GO_RETRY set, or because
  • some other option is taking over the functionality this config provided?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some other option is taking over the functionality this config provided?

The changes in the PR introduce three changes:

  • grpc.KeepaliveEnforcementPolicy

Allow more frequent pings. Default is 5 minutes https://github.com/grpc/grpc-go/blob/87eb5b7502493f758e76c4d09430c0049a81a557/internal/transport/defaults.go#L40
This implies that more frequent pings result in an error HTTP/2 error code: ENHANCE_YOUR_CALM Received Goaway too_many_pings

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we never had GRPC_GO_RETRY set, or because

Good point. This env variable was configured only in ws-manager. Now is present in all components with grpc

"methodConfig": [{
"name": [{"service": "wsdaemon.WorkspaceContentService"}],
"waitForReady": true,

"retryPolicy": {
"MaxAttempts": 6,
"InitialBackoff": ".5s",
"MaxBackoff": "10s",
"BackoffMultiplier": 2,
"RetryableStatusCodes": [ "UNAVAILABLE" ]
}
}]
}`

cfg := managerConfig.WorkspaceDaemon
opts := []grpc.DialOption{
grpc.WithUnaryInterceptor(grpc_opentracing.UnaryClientInterceptor(grpc_opentracing.WithTracer(opentracing.GlobalTracer()))),
grpc.WithStreamInterceptor(grpc_opentracing.StreamClientInterceptor(grpc_opentracing.WithTracer(opentracing.GlobalTracer()))),
grpc.WithBlock(),
grpc.WithBackoffMaxDelay(5 * time.Second),
grpc.WithDefaultServiceConfig(retryPolicy),
grpc.WithKeepaliveParams(keepalive.ClientParameters{
Time: 5 * time.Second,
Timeout: time.Second,
PermitWithoutStream: true,
}),
}
if cfg.TLS.Authority != "" || cfg.TLS.Certificate != "" && cfg.TLS.PrivateKey != "" {
ca := cfg.TLS.Authority
Expand Down