Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gracefully exit the program when the lease expired #2655

Merged
merged 10 commits into from
Jan 19, 2022
123 changes: 106 additions & 17 deletions cmd/internal/serverutil/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package serverutil

import (
"context"
"errors"
"fmt"
"net"
"net/http"
Expand All @@ -28,10 +29,10 @@ import (
"github.com/google/trillian/monitoring"
"github.com/google/trillian/server/admin"
"github.com/google/trillian/server/interceptor"
"github.com/google/trillian/util"
"github.com/google/trillian/util/clock"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.etcd.io/etcd/client/v3/naming/endpoints"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/reflection"
Expand Down Expand Up @@ -126,57 +127,101 @@ func (m *Main) Run(ctx context.Context) error {
trillian.RegisterTrillianAdminServer(srv, admin.New(m.Registry, m.AllowedTreeTypes))
reflection.Register(srv)

g, ctx := errgroup.WithContext(ctx)

if endpoint := m.HTTPEndpoint; endpoint != "" {
http.Handle("/metrics", promhttp.Handler())
http.HandleFunc("/healthz", m.healthz)

go func() {
s := &http.Server{
Addr: endpoint,
}

run := func() error {
glog.Infof("HTTP server starting on %v", endpoint)

var err error
// Let http.ListenAndServeTLS handle the error case when only one of the flags is set.
if m.TLSCertFile != "" || m.TLSKeyFile != "" {
err = http.ListenAndServeTLS(endpoint, m.TLSCertFile, m.TLSKeyFile, nil)
err = s.ListenAndServeTLS(m.TLSCertFile, m.TLSKeyFile)
} else {
err = http.ListenAndServe(endpoint, nil)
err = s.ListenAndServe()
}

if err != nil {
glog.Errorf("HTTP server stopped: %v", err)
if errors.Is(err, http.ErrServerClosed) {
return nil
}

err = fmt.Errorf("HTTP server stopped: %v", err)
}

return err
}

shutdown := func() {
glog.Infof("Stopping HTTP server...")
glog.Flush()

// 15 second exit time limit
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()

if err := s.Shutdown(ctx); err != nil {
glog.Errorf("Failed to http server shutdown: %v", err)
}
}()
}

g.Go(func() error {
return srvRun(ctx, run, shutdown)
})
}

glog.Infof("RPC server starting on %v", m.RPCEndpoint)
lis, err := net.Listen("tcp", m.RPCEndpoint)
if err != nil {
return err
}
go util.AwaitSignal(ctx, srv.Stop)

if m.TreeGCEnabled {
go func() {
g.Go(func() error {
glog.Info("Deleted tree GC started")
gc := admin.NewDeletedTreeGC(
m.Registry.AdminStorage,
m.TreeDeleteThreshold,
m.TreeDeleteMinInterval,
m.Registry.MetricFactory)
gc.Run(ctx)
}()
return nil
})
}

if err := srv.Serve(lis); err != nil {
glog.Errorf("RPC server terminated: %v", err)
run := func() error {
if err := srv.Serve(lis); err != nil {
return fmt.Errorf("RPC server terminated: %v", err)
}

return nil
}

glog.Infof("Stopping server, about to exit")
glog.Flush()
shutdown := func() {
glog.Infof("Stopping RPC server...")
glog.Flush()

srv.GracefulStop()
}

g.Go(func() error {
return srvRun(ctx, run, shutdown)
})

// wait for all jobs to exit gracefully
err = g.Wait()

// Give things a few seconds to tidy up
time.Sleep(time.Second * 5)

return nil
return err
}

// newGRPCServer starts a new Trillian gRPC server.
Expand Down Expand Up @@ -207,10 +252,11 @@ func (m *Main) newGRPCServer() (*grpc.Server, error) {
return s, nil
}

// AnnounceSelf announces this binary's presence to etcd. Returns a function that
// AnnounceSelf announces this binary's presence to etcd. This calls the cancel
// function if the keepalive lease with etcd expires. Returns a function that
// should be called on process exit.
// AnnounceSelf does nothing if client is nil.
func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, endpoint string) func() {
func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, endpoint string, cancel func()) func() {
if client == nil {
return func() {}
}
Expand All @@ -220,7 +266,12 @@ func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, end
if err != nil {
glog.Exitf("Failed to get lease from etcd: %v", err)
}
client.KeepAlive(ctx, leaseRsp.ID)

keepAliveRspCh, err := client.KeepAlive(ctx, leaseRsp.ID)
if err != nil {
glog.Exitf("Failed to keep lease alive from etcd: %v", err)
}
go listenKeepAliveRsp(ctx, keepAliveRspCh, cancel)

em, err := endpoints.NewManager(client, etcdService)
if err != nil {
Expand All @@ -238,3 +289,41 @@ func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, end
client.Revoke(ctx, leaseRsp.ID)
}
}

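// listenKeepAliveRsp drains the `keepAliveRspCh` channel and calls the cancel
// function once that channel is closed, which is treated as the lease having
// expired. It returns without calling cancel if ctx is done first.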
func listenKeepAliveRsp(ctx context.Context, keepAliveRspCh <-chan *clientv3.LeaseKeepAliveResponse, cancel func()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it'd be slightly easier to grok if this method blocked and the caller is responsible for doing go listenKeepAliveRsp(...)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

for {
select {
case <-ctx.Done():
glog.Infof("listenKeepAliveRsp canceled: %v", ctx.Err())
return
case _, ok := <-keepAliveRspCh:
if !ok {
glog.Errorf("listenKeepAliveRsp canceled: unexpected lease expired")
cancel()
return
}
}
}
}

// srvRun starts run in a background goroutine and blocks until it returns.
// If ctx is cancelled before run finishes, shutdown is invoked to ask run to
// stop, and srvRun keeps waiting until run has actually returned. The error
// produced by run is handed back to the caller in both cases.
func srvRun(ctx context.Context, run func() error, shutdown func()) error {
	// Buffered so the goroutine can always deliver its result and exit.
	result := make(chan error, 1)
	go func() {
		result <- run()
	}()

	select {
	case runErr := <-result:
		// run finished on its own; shutdown is not needed.
		return runErr
	case <-ctx.Done():
	}

	shutdown()
	// Block until run has returned so its error is not lost.
	return <-result
}
10 changes: 7 additions & 3 deletions cmd/trillian_log_server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/google/trillian/quota/etcd/quotapb"
"github.com/google/trillian/server"
"github.com/google/trillian/storage"
"github.com/google/trillian/util"
"github.com/google/trillian/util/clock"
clientv3 "go.etcd.io/etcd/client/v3"
"google.golang.org/grpc"
Expand Down Expand Up @@ -91,7 +92,9 @@ func main() {
}
}

ctx := context.Background()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go util.AwaitSignal(ctx, cancel)

var options []grpc.ServerOption
mf := prometheus.MetricFactory{}
Expand Down Expand Up @@ -124,10 +127,11 @@ func main() {
}

// Announce our endpoints to etcd if so configured.
unannounce := serverutil.AnnounceSelf(ctx, client, *etcdService, *rpcEndpoint)
unannounce := serverutil.AnnounceSelf(ctx, client, *etcdService, *rpcEndpoint, cancel)
defer unannounce()

if *httpEndpoint != "" {
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint)
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint, cancel)
defer unannounceHTTP()
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/trillian_log_signer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ func main() {
// Start HTTP server (optional)
if *httpEndpoint != "" {
// Announce our endpoint to etcd if so configured.
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint)
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint, cancel)
defer unannounceHTTP()
}

Expand Down